summaryrefslogtreecommitdiffstats
path: root/src/video_core/buffer_cache
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core/buffer_cache')
-rw-r--r--src/video_core/buffer_cache/buffer_base.h217
-rw-r--r--src/video_core/buffer_cache/buffer_block.h62
-rw-r--r--src/video_core/buffer_cache/buffer_cache.cpp13
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h1656
-rw-r--r--src/video_core/buffer_cache/map_interval.cpp33
-rw-r--r--src/video_core/buffer_cache/map_interval.h93
6 files changed, 1346 insertions, 728 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index ee8602ce9..0c00ae280 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -19,6 +19,7 @@ namespace VideoCommon {
enum class BufferFlagBits {
Picked = 1 << 0,
+ CachedWrites = 1 << 1,
};
DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits)
@@ -40,7 +41,7 @@ class BufferBase {
static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
/// Vector tracking modified pages tightly packed with small vector optimization
- union WrittenWords {
+ union WordsArray {
/// Returns the pointer to the words state
[[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
return is_short ? &stack : heap;
@@ -55,49 +56,59 @@ class BufferBase {
u64* heap; ///< Not-small buffers pointer to the storage
};
- struct GpuCpuWords {
- explicit GpuCpuWords() = default;
- explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} {
+ struct Words {
+ explicit Words() = default;
+ explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
if (IsShort()) {
cpu.stack = ~u64{0};
gpu.stack = 0;
+ cached_cpu.stack = 0;
+ untracked.stack = ~u64{0};
} else {
// Share allocation between CPU and GPU pages and set their default values
const size_t num_words = NumWords();
- u64* const alloc = new u64[num_words * 2];
+ u64* const alloc = new u64[num_words * 4];
cpu.heap = alloc;
gpu.heap = alloc + num_words;
+ cached_cpu.heap = alloc + num_words * 2;
+ untracked.heap = alloc + num_words * 3;
std::fill_n(cpu.heap, num_words, ~u64{0});
std::fill_n(gpu.heap, num_words, 0);
+ std::fill_n(cached_cpu.heap, num_words, 0);
+ std::fill_n(untracked.heap, num_words, ~u64{0});
}
// Clean up tailing bits
- const u64 last_local_page =
- Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE);
+ const u64 last_word_size = size_bytes % BYTES_PER_WORD;
+ const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
- u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1];
- last_word = (last_word << shift) >> shift;
+ const u64 last_word = (~u64{0} << shift) >> shift;
+ cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
+ untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
}
- ~GpuCpuWords() {
+ ~Words() {
Release();
}
- GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept {
+ Words& operator=(Words&& rhs) noexcept {
Release();
size_bytes = rhs.size_bytes;
cpu = rhs.cpu;
gpu = rhs.gpu;
+ cached_cpu = rhs.cached_cpu;
+ untracked = rhs.untracked;
rhs.cpu.heap = nullptr;
return *this;
}
- GpuCpuWords(GpuCpuWords&& rhs) noexcept
- : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} {
+ Words(Words&& rhs) noexcept
+ : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
+ cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
rhs.cpu.heap = nullptr;
}
- GpuCpuWords& operator=(const GpuCpuWords&) = delete;
- GpuCpuWords(const GpuCpuWords&) = delete;
+ Words& operator=(const Words&) = delete;
+ Words(const Words&) = delete;
/// Returns true when the buffer fits in the small vector optimization
[[nodiscard]] bool IsShort() const noexcept {
@@ -118,8 +129,17 @@ class BufferBase {
}
u64 size_bytes = 0;
- WrittenWords cpu;
- WrittenWords gpu;
+ WordsArray cpu;
+ WordsArray gpu;
+ WordsArray cached_cpu;
+ WordsArray untracked;
+ };
+
+ enum class Type {
+ CPU,
+ GPU,
+ CachedCPU,
+ Untracked,
};
public:
@@ -132,68 +152,93 @@ public:
BufferBase& operator=(const BufferBase&) = delete;
BufferBase(const BufferBase&) = delete;
+ BufferBase& operator=(BufferBase&&) = default;
+ BufferBase(BufferBase&&) = default;
+
/// Returns the inclusive CPU modified range in a begin end pair
[[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr;
- return ModifiedRegion<false>(offset, query_size);
+ return ModifiedRegion<Type::CPU>(offset, query_size);
}
/// Returns the inclusive GPU modified range in a begin end pair
[[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr;
- return ModifiedRegion<true>(offset, query_size);
+ return ModifiedRegion<Type::GPU>(offset, query_size);
}
/// Returns true if a region has been modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr;
- return IsRegionModified<false>(offset, query_size);
+ return IsRegionModified<Type::CPU>(offset, query_size);
}
/// Returns true if a region has been modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr;
- return IsRegionModified<true>(offset, query_size);
+ return IsRegionModified<Type::GPU>(offset, query_size);
}
/// Mark region as CPU modified, notifying the rasterizer about this change
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
- ChangeRegionState<true, true>(words.cpu, dirty_cpu_addr, size);
+ ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
}
/// Unmark region as CPU modified, notifying the rasterizer about this change
void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
- ChangeRegionState<false, true>(words.cpu, dirty_cpu_addr, size);
+ ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
}
/// Mark region as modified from the host GPU
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
- ChangeRegionState<true, false>(words.gpu, dirty_cpu_addr, size);
+ ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
}
/// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
- ChangeRegionState<false, false>(words.gpu, dirty_cpu_addr, size);
+ ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
+ }
+
+ /// Mark region as modified from the CPU
+ /// but don't mark it as modified until FlusHCachedWrites is called.
+ void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
+ flags |= BufferFlagBits::CachedWrites;
+ ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
+ }
+
+ /// Flushes cached CPU writes, and notify the rasterizer about the deltas
+ void FlushCachedWrites() noexcept {
+ flags &= ~BufferFlagBits::CachedWrites;
+ const u64 num_words = NumWords();
+ const u64* const cached_words = Array<Type::CachedCPU>();
+ u64* const untracked_words = Array<Type::Untracked>();
+ u64* const cpu_words = Array<Type::CPU>();
+ for (u64 word_index = 0; word_index < num_words; ++word_index) {
+ const u64 cached_bits = cached_words[word_index];
+ NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
+ untracked_words[word_index] |= cached_bits;
+ cpu_words[word_index] |= cached_bits;
+ }
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
- ForEachModifiedRange<false, true>(query_cpu_range, size, func);
+ ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func);
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func>
void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) {
- ForEachModifiedRange<true, false>(query_cpu_range, size, func);
+ ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func);
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func>
void ForEachDownloadRange(Func&& func) {
- ForEachModifiedRange<true, false>(cpu_addr, SizeBytes(), func);
+ ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func);
}
/// Mark buffer as picked
@@ -206,6 +251,16 @@ public:
flags &= ~BufferFlagBits::Picked;
}
+ /// Increases the likeliness of this being a stream buffer
+ void IncreaseStreamScore(int score) noexcept {
+ stream_score += score;
+ }
+
+ /// Returns the likeliness of this being a stream buffer
+ [[nodiscard]] int StreamScore() const noexcept {
+ return stream_score;
+ }
+
/// Returns true when vaddr -> vaddr+size is fully contained in the buffer
[[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
@@ -216,6 +271,11 @@ public:
return True(flags & BufferFlagBits::Picked);
}
+ /// Returns true when the buffer has pending cached writes
+ [[nodiscard]] bool HasCachedWrites() const noexcept {
+ return True(flags & BufferFlagBits::CachedWrites);
+ }
+
/// Returns the base CPU address of the buffer
[[nodiscard]] VAddr CpuAddr() const noexcept {
return cpu_addr;
@@ -233,26 +293,48 @@ public:
}
private:
+ template <Type type>
+ u64* Array() noexcept {
+ if constexpr (type == Type::CPU) {
+ return words.cpu.Pointer(IsShort());
+ } else if constexpr (type == Type::GPU) {
+ return words.gpu.Pointer(IsShort());
+ } else if constexpr (type == Type::CachedCPU) {
+ return words.cached_cpu.Pointer(IsShort());
+ } else if constexpr (type == Type::Untracked) {
+ return words.untracked.Pointer(IsShort());
+ }
+ }
+
+ template <Type type>
+ const u64* Array() const noexcept {
+ if constexpr (type == Type::CPU) {
+ return words.cpu.Pointer(IsShort());
+ } else if constexpr (type == Type::GPU) {
+ return words.gpu.Pointer(IsShort());
+ } else if constexpr (type == Type::CachedCPU) {
+ return words.cached_cpu.Pointer(IsShort());
+ } else if constexpr (type == Type::Untracked) {
+ return words.untracked.Pointer(IsShort());
+ }
+ }
+
/**
* Change the state of a range of pages
*
- * @param written_words Pages to be marked or unmarked as modified
* @param dirty_addr Base address to mark or unmark as modified
* @param size Size in bytes to mark or unmark as modified
- *
- * @tparam enable True when the bits will be set to one, false for zero
- * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes
*/
- template <bool enable, bool notify_rasterizer>
- void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr,
- s64 size) noexcept(!notify_rasterizer) {
+ template <Type type, bool enable>
+ void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
const s64 difference = dirty_addr - cpu_addr;
const u64 offset = std::max<s64>(difference, 0);
size += std::min<s64>(difference, 0);
if (offset >= SizeBytes() || size < 0) {
return;
}
- u64* const state_words = written_words.Pointer(IsShort());
+ u64* const untracked_words = Array<Type::Untracked>();
+ u64* const state_words = Array<type>();
const u64 offset_end = std::min(offset + size, SizeBytes());
const u64 begin_page_index = offset / BYTES_PER_PAGE;
const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
@@ -268,13 +350,19 @@ private:
u64 bits = ~u64{0};
bits = (bits >> right_offset) << right_offset;
bits = (bits << left_offset) >> left_offset;
- if constexpr (notify_rasterizer) {
- NotifyRasterizer<!enable>(word_index, state_words[word_index], bits);
+ if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+ NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
}
if constexpr (enable) {
state_words[word_index] |= bits;
+ if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+ untracked_words[word_index] |= bits;
+ }
} else {
state_words[word_index] &= ~bits;
+ if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+ untracked_words[word_index] &= ~bits;
+ }
}
page_index = 0;
++word_index;
@@ -291,7 +379,7 @@ private:
* @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
*/
template <bool add_to_rasterizer>
- void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) {
+ void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
while (changed_bits != 0) {
@@ -315,21 +403,20 @@ private:
* @param query_cpu_range Base CPU address to loop over
* @param size Size in bytes of the CPU range to loop over
* @param func Function to call for each turned off region
- *
- * @tparam gpu True for host GPU pages, false for CPU pages
- * @tparam notify_rasterizer True when the rasterizer should be notified about state changes
*/
- template <bool gpu, bool notify_rasterizer, typename Func>
+ template <Type type, typename Func>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
+ static_assert(type != Type::Untracked);
+
const s64 difference = query_cpu_range - cpu_addr;
const u64 query_begin = std::max<s64>(difference, 0);
size += std::min<s64>(difference, 0);
if (query_begin >= SizeBytes() || size < 0) {
return;
}
- const u64* const cpu_words = words.cpu.Pointer(IsShort());
+ u64* const untracked_words = Array<Type::Untracked>();
+ u64* const state_words = Array<type>();
const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
- u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
@@ -345,7 +432,8 @@ private:
const u64 word_index_end = std::distance(state_words, last_modified_word);
const unsigned local_page_begin = std::countr_zero(*first_modified_word);
- const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]);
+ const unsigned local_page_end =
+ static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
@@ -371,11 +459,13 @@ private:
const u64 current_word = state_words[word_index] & bits;
state_words[word_index] &= ~bits;
- // Exclude CPU modified pages when visiting GPU pages
- const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0);
- if constexpr (notify_rasterizer) {
- NotifyRasterizer<true>(word_index, word, ~u64{0});
+ if constexpr (type == Type::CPU) {
+ const u64 current_bits = untracked_words[word_index] & bits;
+ untracked_words[word_index] &= ~bits;
+ NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
}
+ // Exclude CPU modified pages when visiting GPU pages
+ const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
u64 page = page_begin;
page_begin = 0;
@@ -416,17 +506,20 @@ private:
* @param offset Offset in bytes from the start of the buffer
* @param size Size in bytes of the region to query for modifications
*/
- template <bool gpu>
+ template <Type type>
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
- const u64* const cpu_words = words.cpu.Pointer(IsShort());
- const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
+ static_assert(type != Type::Untracked);
+
+ const u64* const untracked_words = Array<Type::Untracked>();
+ const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD;
const u64 word_end = std::min(word_begin + num_query_words, NumWords());
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
- const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0);
+ const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+ const u64 word = state_words[word_index] & ~off_word;
if (word == 0) {
continue;
}
@@ -445,13 +538,13 @@ private:
*
* @param offset Offset in bytes from the start of the buffer
* @param size Size in bytes of the region to query for modifications
- *
- * @tparam gpu True to query GPU modified pages, false for CPU pages
*/
- template <bool gpu>
+ template <Type type>
[[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
- const u64* const cpu_words = words.cpu.Pointer(IsShort());
- const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
+ static_assert(type != Type::Untracked);
+
+ const u64* const untracked_words = Array<Type::Untracked>();
+ const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD;
const u64 word_end = std::min(word_begin + num_query_words, NumWords());
@@ -460,7 +553,8 @@ private:
u64 begin = std::numeric_limits<u64>::max();
u64 end = 0;
for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
- const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0);
+ const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+ const u64 word = state_words[word_index] & ~off_word;
if (word == 0) {
continue;
}
@@ -488,8 +582,9 @@ private:
RasterizerInterface* rasterizer = nullptr;
VAddr cpu_addr = 0;
- GpuCpuWords words;
+ Words words;
BufferFlagBits flags{};
+ int stream_score = 0;
};
} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
- [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
- return (cpu_addr < end) && (cpu_addr_end > start);
- }
-
- [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
- return cpu_addr <= other_start && other_end <= cpu_addr_end;
- }
-
- [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
- return static_cast<std::size_t>(in_addr - cpu_addr);
- }
-
- [[nodiscard]] VAddr CpuAddr() const {
- return cpu_addr;
- }
-
- [[nodiscard]] VAddr CpuAddrEnd() const {
- return cpu_addr_end;
- }
-
- void SetCpuAddr(VAddr new_addr) {
- cpu_addr = new_addr;
- cpu_addr_end = new_addr + size;
- }
-
- [[nodiscard]] std::size_t Size() const {
- return size;
- }
-
- [[nodiscard]] u64 Epoch() const {
- return epoch;
- }
-
- void SetEpoch(u64 new_epoch) {
- epoch = new_epoch;
- }
-
-protected:
- explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
- SetCpuAddr(cpu_addr_);
- }
-
-private:
- VAddr cpu_addr{};
- VAddr cpu_addr_end{};
- std::size_t size{};
- u64 epoch{};
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..2a6844ab1 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1289 @@
#pragma once
-#include <list>
+#include <algorithm>
+#include <array>
+#include <deque>
#include <memory>
#include <mutex>
+#include <span>
#include <unordered_map>
-#include <unordered_set>
-#include <utility>
#include <vector>
#include <boost/container/small_vector.hpp>
-#include <boost/icl/interval_set.hpp>
-#include <boost/intrusive/set.hpp>
-#include "common/alignment.h"
-#include "common/assert.h"
#include "common/common_types.h"
-#include "common/logging/log.h"
-#include "core/core.h"
+#include "common/div_ceil.h"
+#include "common/microprofile.h"
+#include "common/scope_exit.h"
#include "core/memory.h"
#include "core/settings.h"
-#include "video_core/buffer_cache/buffer_block.h"
-#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/buffer_cache/buffer_base.h"
+#include "video_core/delayed_destruction_ring.h"
+#include "video_core/dirty_flags.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
+#include "video_core/texture_cache/slot_vector.h"
+#include "video_core/texture_cache/types.h"
namespace VideoCommon {
-template <typename Buffer, typename BufferType, typename StreamBuffer>
+MICROPROFILE_DECLARE(GPU_PrepareBuffers);
+MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
+MICROPROFILE_DECLARE(GPU_DownloadMemory);
+
+using BufferId = SlotId;
+
+constexpr u32 NUM_VERTEX_BUFFERS = 32;
+constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
+constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
+constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
+constexpr u32 NUM_STORAGE_BUFFERS = 16;
+constexpr u32 NUM_STAGES = 5;
+
+template <typename P>
class BufferCache {
- using IntervalSet = boost::icl::interval_set<VAddr>;
- using IntervalType = typename IntervalSet::interval_type;
- using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+ // Page size for caching purposes.
+ // This is unrelated to the CPU page size and it can be changed as it seems optimal.
+ static constexpr u32 PAGE_BITS = 16;
+ static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
+
+ static constexpr bool IS_OPENGL = P::IS_OPENGL;
+ static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
+ P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
+ static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
+ P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
+ static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
+ static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
+ static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
+
+ static constexpr BufferId NULL_BUFFER_ID{0};
+
+ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+ using Runtime = typename P::Runtime;
+ using Buffer = typename P::Buffer;
+
+ struct Empty {};
+
+ struct OverlapResult {
+ std::vector<BufferId> ids;
+ VAddr begin;
+ VAddr end;
+ bool has_stream_leap = false;
+ };
- static constexpr u64 WRITE_PAGE_BIT = 11;
- static constexpr u64 BLOCK_PAGE_BITS = 21;
- static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+ struct Binding {
+ VAddr cpu_addr{};
+ u32 size{};
+ BufferId buffer_id;
+ };
-public:
- struct BufferInfo {
- BufferType handle;
- u64 offset;
- u64 address;
+ static constexpr Binding NULL_BINDING{
+ .cpu_addr = 0,
+ .size = 0,
+ .buffer_id = NULL_BUFFER_ID,
};
- BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
- bool is_written = false, bool use_fast_cbuf = false) {
- std::lock_guard lock{mutex};
+public:
+ static constexpr u32 SKIP_CACHE_SIZE = 4096;
- const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
- if (!cpu_addr) {
- return GetEmptyBuffer(size);
- }
+ explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_,
+ Tegra::Engines::KeplerCompute& kepler_compute_,
+ Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+ Runtime& runtime_);
- // Cache management is a big overhead, so only cache entries with a given size.
- // TODO: Figure out which size is the best for given games.
- constexpr std::size_t max_stream_size = 0x800;
- if (use_fast_cbuf || size < max_stream_size) {
- if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) {
- const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
- if (use_fast_cbuf) {
- u8* dest;
- if (is_granular) {
- dest = gpu_memory.GetPointer(gpu_addr);
- } else {
- staging_buffer.resize(size);
- dest = staging_buffer.data();
- gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
- }
- return ConstBufferUpload(dest, size);
- }
- if (is_granular) {
- u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
- return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
- std::memcpy(dest, host_ptr, size);
- });
- } else {
- return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
- gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
- });
- }
- }
- }
+ void TickFrame();
- Buffer* const block = GetBlock(*cpu_addr, size);
- MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
- if (!map) {
- return GetEmptyBuffer(size);
- }
- if (is_written) {
- map->MarkAsModified(true, GetModifiedTicks());
- if (Settings::IsGPULevelHigh() &&
- Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
- MarkForAsyncFlush(map);
- }
- if (!map->is_written) {
- map->is_written = true;
- MarkRegionAsWritten(map->start, map->end - 1);
- }
- }
+ void WriteMemory(VAddr cpu_addr, u64 size);
- return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()};
- }
+ void CachedWriteMemory(VAddr cpu_addr, u64 size);
- /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
- BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
- std::size_t alignment = 4) {
- std::lock_guard lock{mutex};
- return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
- std::memcpy(dest, raw_pointer, size);
- });
- }
+ void DownloadMemory(VAddr cpu_addr, u64 size);
- /// Prepares the buffer cache for data uploading
- /// @param max_size Maximum number of bytes that will be uploaded
- /// @return True when a stream buffer invalidation was required, false otherwise
- void Map(std::size_t max_size) {
- std::lock_guard lock{mutex};
+ void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
- std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
- buffer_offset = buffer_offset_base;
- }
+ void UpdateGraphicsBuffers(bool is_indexed);
- /// Finishes the upload stream
- void Unmap() {
- std::lock_guard lock{mutex};
- stream_buffer.Unmap(buffer_offset - buffer_offset_base);
- }
+ void UpdateComputeBuffers();
- /// Function called at the end of each frame, inteded for deferred operations
- void TickFrame() {
- ++epoch;
+ void BindHostGeometryBuffers(bool is_indexed);
- while (!pending_destruction.empty()) {
- // Delay at least 4 frames before destruction.
- // This is due to triple buffering happening on some drivers.
- static constexpr u64 epochs_to_destroy = 5;
- if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
- break;
- }
- pending_destruction.pop();
- }
- }
+ void BindHostStageBuffers(size_t stage);
- /// Write any cached resources overlapping the specified region back to memory
- void FlushRegion(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void BindHostComputeBuffers();
- VectorMapInterval objects = GetMapsInRange(addr, size);
- std::sort(objects.begin(), objects.end(),
- [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
- for (MapInterval* object : objects) {
- if (object->is_modified && object->is_registered) {
- mutex.unlock();
- FlushMap(object);
- mutex.lock();
- }
- }
- }
+ void SetEnabledUniformBuffers(size_t stage, u32 enabled);
- bool MustFlushRegion(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void SetEnabledComputeUniformBuffers(u32 enabled);
- const VectorMapInterval objects = GetMapsInRange(addr, size);
- return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
- return map->is_modified && map->is_registered;
- });
- }
+ void UnbindGraphicsStorageBuffers(size_t stage);
- /// Mark the specified region as being invalidated
- void InvalidateRegion(VAddr addr, u64 size) {
- std::lock_guard lock{mutex};
+ void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+ bool is_written);
- for (auto& object : GetMapsInRange(addr, size)) {
- if (object->is_registered) {
- Unregister(object);
- }
- }
- }
+ void UnbindComputeStorageBuffers();
- void OnCPUWrite(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+ bool is_written);
- for (MapInterval* object : GetMapsInRange(addr, size)) {
- if (object->is_memory_marked && object->is_registered) {
- UnmarkMemory(object);
- object->is_sync_pending = true;
- marked_for_unregister.emplace_back(object);
- }
- }
- }
+ void FlushCachedWrites();
- void SyncGuestHost() {
- std::lock_guard lock{mutex};
+ /// Return true when there are uncommitted buffers to be downloaded
+ [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
- for (auto& object : marked_for_unregister) {
- if (object->is_registered) {
- object->is_sync_pending = false;
- Unregister(object);
- }
+ /// Return true when the caller should wait for async downloads
+ [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
+
+ /// Commit asynchronous downloads
+ void CommitAsyncFlushes();
+
+ /// Pop asynchronous downloads
+ void PopAsyncFlushes();
+
+ /// Return true when a CPU region is modified from the GPU
+ [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
+
+ std::mutex mutex;
+
+private:
+ template <typename Func>
+ static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
+ for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
+ const int disabled_bits = std::countr_zero(enabled_mask);
+ index += disabled_bits;
+ enabled_mask >>= disabled_bits;
+ func(index);
}
- marked_for_unregister.clear();
}
- void CommitAsyncFlushes() {
- if (uncommitted_flushes) {
- auto commit_list = std::make_shared<std::list<MapInterval*>>();
- for (MapInterval* map : *uncommitted_flushes) {
- if (map->is_registered && map->is_modified) {
- // TODO(Blinkhawk): Implement backend asynchronous flushing
- // AsyncFlushMap(map)
- commit_list->push_back(map);
- }
- }
- if (!commit_list->empty()) {
- committed_flushes.push_back(commit_list);
- } else {
- committed_flushes.emplace_back();
+ template <typename Func>
+ void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
+ const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
+ for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
+ const BufferId buffer_id = page_table[page];
+ if (!buffer_id) {
+ ++page;
+ continue;
}
- } else {
- committed_flushes.emplace_back();
+ Buffer& buffer = slot_buffers[buffer_id];
+ func(buffer_id, buffer);
+
+ const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+ page = Common::DivCeil(end_addr, PAGE_SIZE);
}
- uncommitted_flushes.reset();
}
- bool ShouldWaitAsyncFlushes() const {
- return !committed_flushes.empty() && committed_flushes.front() != nullptr;
+ static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
+ return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
+ ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
}
- bool HasUncommittedFlushes() const {
- return uncommitted_flushes != nullptr;
- }
+ void BindHostIndexBuffer();
- void PopAsyncFlushes() {
- if (committed_flushes.empty()) {
- return;
- }
- auto& flush_list = committed_flushes.front();
- if (!flush_list) {
- committed_flushes.pop_front();
- return;
- }
- for (MapInterval* map : *flush_list) {
- if (map->is_registered) {
- // TODO(Blinkhawk): Replace this for reading the asynchronous flush
- FlushMap(map);
- }
- }
- committed_flushes.pop_front();
- }
+ void BindHostVertexBuffers();
- virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
+ void BindHostGraphicsUniformBuffers(size_t stage);
-protected:
- explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
- Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
- StreamBuffer& stream_buffer_)
- : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
- stream_buffer{stream_buffer_} {}
+ void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
- ~BufferCache() = default;
+ void BindHostGraphicsStorageBuffers(size_t stage);
- virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
+ void BindHostTransformFeedbackBuffers();
- virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
- return {};
- }
+ void BindHostComputeUniformBuffers();
- /// Register an object into the cache
- MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
- const VAddr cpu_addr = new_map.start;
- if (!cpu_addr) {
- LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
- new_map.gpu_addr);
- return nullptr;
- }
- const std::size_t size = new_map.end - new_map.start;
- new_map.is_registered = true;
- rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
- new_map.is_memory_marked = true;
- if (inherit_written) {
- MarkRegionAsWritten(new_map.start, new_map.end - 1);
- new_map.is_written = true;
- }
- MapInterval* const storage = mapped_addresses_allocator.Allocate();
- *storage = new_map;
- mapped_addresses.insert(*storage);
- return storage;
- }
+ void BindHostComputeStorageBuffers();
- void UnmarkMemory(MapInterval* map) {
- if (!map->is_memory_marked) {
- return;
- }
- const std::size_t size = map->end - map->start;
- rasterizer.UpdatePagesCachedCount(map->start, size, -1);
- map->is_memory_marked = false;
- }
-
- /// Unregisters an object from the cache
- void Unregister(MapInterval* map) {
- UnmarkMemory(map);
- map->is_registered = false;
- if (map->is_sync_pending) {
- map->is_sync_pending = false;
- marked_for_unregister.remove(map);
+ void DoUpdateGraphicsBuffers(bool is_indexed);
+
+ void DoUpdateComputeBuffers();
+
+ void UpdateIndexBuffer();
+
+ void UpdateVertexBuffers();
+
+ void UpdateVertexBuffer(u32 index);
+
+ void UpdateUniformBuffers(size_t stage);
+
+ void UpdateStorageBuffers(size_t stage);
+
+ void UpdateTransformFeedbackBuffers();
+
+ void UpdateTransformFeedbackBuffer(u32 index);
+
+ void UpdateComputeUniformBuffers();
+
+ void UpdateComputeStorageBuffers();
+
+ void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
+
+ [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
+
+ [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
+
+ void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
+
+ [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
+
+ void Register(BufferId buffer_id);
+
+ void Unregister(BufferId buffer_id);
+
+ template <bool insert>
+ void ChangeRegister(BufferId buffer_id);
+
+ void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
+
+ void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
+
+ void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
+ std::span<BufferCopy> copies);
+
+ void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
+ std::span<const BufferCopy> copies);
+
+ void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
+
+ void DeleteBuffer(BufferId buffer_id);
+
+ void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
+
+ void NotifyBufferDeletion();
+
+ [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
+
+ [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
+
+ [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
+
+ [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
+
+ VideoCore::RasterizerInterface& rasterizer;
+ Tegra::Engines::Maxwell3D& maxwell3d;
+ Tegra::Engines::KeplerCompute& kepler_compute;
+ Tegra::MemoryManager& gpu_memory;
+ Core::Memory::Memory& cpu_memory;
+ Runtime& runtime;
+
+ SlotVector<Buffer> slot_buffers;
+ DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
+
+ u32 last_index_count = 0;
+
+ Binding index_buffer;
+ std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
+ std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
+ std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
+ std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
+
+ std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
+ std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
+
+ std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
+ u32 enabled_compute_uniform_buffers = 0;
+
+ std::array<u32, NUM_STAGES> enabled_storage_buffers{};
+ std::array<u32, NUM_STAGES> written_storage_buffers{};
+ u32 enabled_compute_storage_buffers = 0;
+ u32 written_compute_storage_buffers = 0;
+
+ std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
+
+ bool has_deleted_buffers = false;
+
+ std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
+ dirty_uniform_buffers{};
+
+ std::vector<BufferId> cached_write_buffer_ids;
+
+ // TODO: This data structure is not optimal and it should be reworked
+ std::vector<BufferId> uncommitted_downloads;
+ std::deque<std::vector<BufferId>> committed_downloads;
+
+ size_t immediate_buffer_capacity = 0;
+ std::unique_ptr<u8[]> immediate_buffer_alloc;
+
+ std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
+};
+
+template <class P>
+BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_,
+ Tegra::Engines::KeplerCompute& kepler_compute_,
+ Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+ Runtime& runtime_)
+ : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
+ gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
+ // Ensure the first slot is used for the null buffer
+ void(slot_buffers.insert(runtime, NullBufferParams{}));
+}
+
+template <class P>
+void BufferCache<P>::TickFrame() {
+ delayed_destruction_ring.Tick();
+}
+
+template <class P>
+void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
+ buffer.MarkRegionAsCpuModified(cpu_addr, size);
+ });
+}
+
+template <class P>
+void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+ if (!buffer.HasCachedWrites()) {
+ cached_write_buffer_ids.push_back(buffer_id);
}
- if (map->is_written) {
- UnmarkRegionAsWritten(map->start, map->end - 1);
+ buffer.CachedCpuWrite(cpu_addr, size);
+ });
+}
+
+template <class P>
+void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
+ boost::container::small_vector<BufferCopy, 1> copies;
+ u64 total_size_bytes = 0;
+ u64 largest_copy = 0;
+ buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+ copies.push_back(BufferCopy{
+ .src_offset = range_offset,
+ .dst_offset = total_size_bytes,
+ .size = range_size,
+ });
+ total_size_bytes += range_size;
+ largest_copy = std::max(largest_copy, range_size);
+ });
+ if (total_size_bytes == 0) {
+ return;
}
- const auto it = mapped_addresses.find(*map);
- ASSERT(it != mapped_addresses.end());
- mapped_addresses.erase(it);
- mapped_addresses_allocator.Release(map);
- }
-
-private:
- MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
- const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
- if (overlaps.empty()) {
- const VAddr cpu_addr_end = cpu_addr + size;
- if (gpu_memory.IsGranularRange(gpu_addr, size)) {
- u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
- block->Upload(block->Offset(cpu_addr), size, host_ptr);
- } else {
- staging_buffer.resize(size);
- gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
- block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
+ MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+ if constexpr (USE_MEMORY_MAPS) {
+ auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+ const u8* const mapped_memory = download_staging.mapped_span.data();
+ const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
+ for (BufferCopy& copy : copies) {
+ // Modify copies to have the staging offset in mind
+ copy.dst_offset += download_staging.offset;
}
- return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
- }
-
- const VAddr cpu_addr_end = cpu_addr + size;
- if (overlaps.size() == 1) {
- MapInterval* const current_map = overlaps[0];
- if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
- return current_map;
+ runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
+ runtime.Finish();
+ for (const BufferCopy& copy : copies) {
+ const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ // Undo the modified offset
+ const u64 dst_offset = copy.dst_offset - download_staging.offset;
+ const u8* copy_mapped_memory = mapped_memory + dst_offset;
+ cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
+ }
+ } else {
+ const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+ for (const BufferCopy& copy : copies) {
+ buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+ const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
}
}
- VAddr new_start = cpu_addr;
- VAddr new_end = cpu_addr_end;
- bool write_inheritance = false;
- bool modified_inheritance = false;
- // Calculate new buffer parameters
- for (MapInterval* overlap : overlaps) {
- new_start = std::min(overlap->start, new_start);
- new_end = std::max(overlap->end, new_end);
- write_inheritance |= overlap->is_written;
- modified_inheritance |= overlap->is_modified;
+ });
+}
+
+template <class P>
+void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+ u32 size) {
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) {
+ uniform_buffers[stage][index] = NULL_BINDING;
+ return;
+ }
+ const Binding binding{
+ .cpu_addr = *cpu_addr,
+ .size = size,
+ .buffer_id = BufferId{},
+ };
+ uniform_buffers[stage][index] = binding;
+}
+
+template <class P>
+void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
+ MICROPROFILE_SCOPE(GPU_PrepareBuffers);
+ do {
+ has_deleted_buffers = false;
+ DoUpdateGraphicsBuffers(is_indexed);
+ } while (has_deleted_buffers);
+}
+
+template <class P>
+void BufferCache<P>::UpdateComputeBuffers() {
+ MICROPROFILE_SCOPE(GPU_PrepareBuffers);
+ do {
+ has_deleted_buffers = false;
+ DoUpdateComputeBuffers();
+ } while (has_deleted_buffers);
+}
+
+template <class P>
+void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ if (is_indexed) {
+ BindHostIndexBuffer();
+ } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
+ const auto& regs = maxwell3d.regs;
+ if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+ runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
}
- GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
- for (auto& overlap : overlaps) {
- Unregister(overlap);
+ }
+ BindHostVertexBuffers();
+ BindHostTransformFeedbackBuffers();
+}
+
+template <class P>
+void BufferCache<P>::BindHostStageBuffers(size_t stage) {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ BindHostGraphicsUniformBuffers(stage);
+ BindHostGraphicsStorageBuffers(stage);
+}
+
+template <class P>
+void BufferCache<P>::BindHostComputeBuffers() {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ BindHostComputeUniformBuffers();
+ BindHostComputeStorageBuffers();
+}
+
+template <class P>
+void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ if (enabled_uniform_buffers[stage] != enabled) {
+ dirty_uniform_buffers[stage] = ~u32{0};
}
- UpdateBlock(block, new_start, new_end, overlaps);
-
- const MapInterval new_map{new_start, new_end, new_gpu_addr};
- MapInterval* const map = Register(new_map, write_inheritance);
- if (!map) {
- return nullptr;
+ }
+ enabled_uniform_buffers[stage] = enabled;
+}
+
+template <class P>
+void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
+ enabled_compute_uniform_buffers = enabled;
+}
+
+template <class P>
+void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
+ enabled_storage_buffers[stage] = 0;
+ written_storage_buffers[stage] = 0;
+}
+
+template <class P>
+void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
+ u32 cbuf_offset, bool is_written) {
+ enabled_storage_buffers[stage] |= 1U << ssbo_index;
+ written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
+
+ const auto& cbufs = maxwell3d.state.shader_stages[stage];
+ const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
+ storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
+}
+
+template <class P>
+void BufferCache<P>::UnbindComputeStorageBuffers() {
+ enabled_compute_storage_buffers = 0;
+ written_compute_storage_buffers = 0;
+}
+
+template <class P>
+void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+ bool is_written) {
+ enabled_compute_storage_buffers |= 1U << ssbo_index;
+ written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
+
+ const auto& launch_desc = kepler_compute.launch_description;
+ ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
+
+ const auto& cbufs = launch_desc.const_buffer_config;
+ const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
+ compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
+}
+
+template <class P>
+void BufferCache<P>::FlushCachedWrites() {
+ for (const BufferId buffer_id : cached_write_buffer_ids) {
+ slot_buffers[buffer_id].FlushCachedWrites();
+ }
+ cached_write_buffer_ids.clear();
+}
+
+template <class P>
+bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
+ return !uncommitted_downloads.empty();
+}
+
+template <class P>
+bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
+ return !committed_downloads.empty() && !committed_downloads.front().empty();
+}
+
+template <class P>
+void BufferCache<P>::CommitAsyncFlushes() {
+ // This is intentionally passing the value by copy
+ committed_downloads.push_front(uncommitted_downloads);
+ uncommitted_downloads.clear();
+}
+
+template <class P>
+void BufferCache<P>::PopAsyncFlushes() {
+ if (committed_downloads.empty()) {
+ return;
+ }
+ auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
+ const std::span<const BufferId> download_ids = committed_downloads.back();
+ if (download_ids.empty()) {
+ return;
+ }
+ MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+ boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
+ u64 total_size_bytes = 0;
+ u64 largest_copy = 0;
+ for (const BufferId buffer_id : download_ids) {
+ slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
+ downloads.push_back({
+ BufferCopy{
+ .src_offset = range_offset,
+ .dst_offset = total_size_bytes,
+ .size = range_size,
+ },
+ buffer_id,
+ });
+ total_size_bytes += range_size;
+ largest_copy = std::max(largest_copy, range_size);
+ });
+ }
+ if (downloads.empty()) {
+ return;
+ }
+ if constexpr (USE_MEMORY_MAPS) {
+ auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+ for (auto& [copy, buffer_id] : downloads) {
+ // Have in mind the staging buffer offset for the copy
+ copy.dst_offset += download_staging.offset;
+ const std::array copies{copy};
+ runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
}
- if (modified_inheritance) {
- map->MarkAsModified(true, GetModifiedTicks());
- if (Settings::IsGPULevelHigh() &&
- Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
- MarkForAsyncFlush(map);
- }
+ runtime.Finish();
+ for (const auto [copy, buffer_id] : downloads) {
+ const Buffer& buffer = slot_buffers[buffer_id];
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ // Undo the modified offset
+ const u64 dst_offset = copy.dst_offset - download_staging.offset;
+ const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
+ cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
+ }
+ } else {
+ const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+ for (const auto [copy, buffer_id] : downloads) {
+ Buffer& buffer = slot_buffers[buffer_id];
+ buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
}
- return map;
}
-
- void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
- const IntervalType base_interval{start, end};
- IntervalSet interval_set{};
- interval_set.add(base_interval);
- for (auto& overlap : overlaps) {
- const IntervalType subtract{overlap->start, overlap->end};
- interval_set.subtract(subtract);
+}
+
+template <class P>
+bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
+ const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
+ for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+ const BufferId image_id = page_table[page];
+ if (!image_id) {
+ ++page;
+ continue;
}
- for (auto& interval : interval_set) {
- const std::size_t size = interval.upper() - interval.lower();
- if (size == 0) {
- continue;
- }
- staging_buffer.resize(size);
- cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
- block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
+ Buffer& buffer = slot_buffers[image_id];
+ if (buffer.IsRegionGpuModified(addr, size)) {
+ return true;
}
+ const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+ page = Common::DivCeil(end_addr, PAGE_SIZE);
}
-
- VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
- VectorMapInterval result;
- if (size == 0) {
- return result;
+ return false;
+}
+
+template <class P>
+void BufferCache<P>::BindHostIndexBuffer() {
+ Buffer& buffer = slot_buffers[index_buffer.buffer_id];
+ const u32 offset = buffer.Offset(index_buffer.cpu_addr);
+ const u32 size = index_buffer.size;
+ SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
+ if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
+ runtime.BindIndexBuffer(buffer, offset, size);
+ } else {
+ runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
+ maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
+ buffer, offset, size);
+ }
+}
+
+template <class P>
+void BufferCache<P>::BindHostVertexBuffers() {
+ auto& flags = maxwell3d.dirty.flags;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ const Binding& binding = vertex_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
+ if (!flags[Dirty::VertexBuffer0 + index]) {
+ continue;
}
+ flags[Dirty::VertexBuffer0 + index] = false;
- const VAddr addr_end = addr + size;
- auto it = mapped_addresses.lower_bound(addr);
- if (it != mapped_addresses.begin()) {
- --it;
+ const u32 stride = maxwell3d.regs.vertex_array[index].stride;
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
+ }
+}
+
+template <class P>
+void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
+ u32 dirty = ~0U;
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ dirty = std::exchange(dirty_uniform_buffers[stage], 0);
+ }
+ u32 binding_index = 0;
+ ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
+ const bool needs_bind = ((dirty >> index) & 1) != 0;
+ BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
+ ++binding_index;
}
- while (it != mapped_addresses.end() && it->start < addr_end) {
- if (it->Overlaps(addr, addr_end)) {
- result.push_back(&*it);
+ });
+}
+
+template <class P>
+void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
+ bool needs_bind) {
+ const Binding& binding = uniform_buffers[stage][index];
+ const VAddr cpu_addr = binding.cpu_addr;
+ const u32 size = binding.size;
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
+ if constexpr (IS_OPENGL) {
+ if (runtime.HasFastBufferSubData()) {
+ // Fast path for Nvidia
+ if (!HasFastUniformBufferBound(stage, binding_index)) {
+ // We only have to bind when the currently bound buffer is not the fast version
+ runtime.BindFastUniformBuffer(stage, binding_index, size);
+ }
+ const auto span = ImmediateBufferWithData(cpu_addr, size);
+ runtime.PushFastUniformBuffer(stage, binding_index, span);
+ return;
}
- ++it;
}
- return result;
- }
+ fast_bound_uniform_buffers[stage] |= 1U << binding_index;
- /// Returns a ticks counter used for tracking when cached objects were last modified
- u64 GetModifiedTicks() {
- return ++modified_ticks;
+ // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
+ const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
+ cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
+ return;
}
-
- void FlushMap(MapInterval* map) {
- const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
- ASSERT_OR_EXECUTE(it != blocks.end(), return;);
-
- std::shared_ptr<Buffer> block = it->second;
-
- const std::size_t size = map->end - map->start;
- staging_buffer.resize(size);
- block->Download(block->Offset(map->start), size, staging_buffer.data());
- cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
- map->MarkAsModified(false, 0);
+ // Classic cached path
+ SynchronizeBuffer(buffer, cpu_addr, size);
+ if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
+ // Skip binding if it's not needed and if the bound buffer is not the fast version
+ // This exists to avoid instances where the fast buffer is bound and a GPU write happens
+ return;
}
+ fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
- template <typename Callable>
- BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
- AlignBuffer(alignment);
- const std::size_t uploaded_offset = buffer_offset;
- callable(buffer_ptr);
-
- buffer_ptr += size;
- buffer_offset += size;
- return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
+ const u32 offset = buffer.Offset(cpu_addr);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
+ runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
+ } else {
+ runtime.BindUniformBuffer(buffer, offset, size);
}
+}
+
+template <class P>
+void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
+ u32 binding_index = 0;
+ ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
+ const Binding& binding = storage_buffers[stage][index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0;
+ if constexpr (NEEDS_BIND_STORAGE_INDEX) {
+ runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written);
+ ++binding_index;
+ } else {
+ runtime.BindStorageBuffer(buffer, offset, size, is_written);
+ }
+ });
+}
- void AlignBuffer(std::size_t alignment) {
- // Align the offset, not the mapped pointer
- const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
- buffer_ptr += offset_aligned - buffer_offset;
- buffer_offset = offset_aligned;
+template <class P>
+void BufferCache<P>::BindHostTransformFeedbackBuffers() {
+ if (maxwell3d.regs.tfb_enabled == 0) {
+ return;
}
+ for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
+ const Binding& binding = transform_feedback_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
+ }
+}
- std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
- const std::size_t old_size = buffer->Size();
- const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
- const VAddr cpu_addr = buffer->CpuAddr();
- std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
- new_buffer->CopyFrom(*buffer, 0, 0, old_size);
- QueueDestruction(std::move(buffer));
-
- const VAddr cpu_addr_end = cpu_addr + new_size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- blocks.insert_or_assign(page_start, new_buffer);
+template <class P>
+void BufferCache<P>::BindHostComputeUniformBuffers() {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ // Mark all uniform buffers as dirty
+ dirty_uniform_buffers.fill(~u32{0});
+ }
+ u32 binding_index = 0;
+ ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+ const Binding& binding = compute_uniform_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
+ runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
+ ++binding_index;
+ } else {
+ runtime.BindUniformBuffer(buffer, offset, size);
}
+ });
+}
+
+template <class P>
+void BufferCache<P>::BindHostComputeStorageBuffers() {
+ u32 binding_index = 0;
+ ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
+ const Binding& binding = compute_storage_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0;
+ if constexpr (NEEDS_BIND_STORAGE_INDEX) {
+ runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written);
+ ++binding_index;
+ } else {
+ runtime.BindStorageBuffer(buffer, offset, size, is_written);
+ }
+ });
+}
- return new_buffer;
+template <class P>
+void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
+ if (is_indexed) {
+ UpdateIndexBuffer();
}
+ UpdateVertexBuffers();
+ UpdateTransformFeedbackBuffers();
+ for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+ UpdateUniformBuffers(stage);
+ UpdateStorageBuffers(stage);
+ }
+}
+
+template <class P>
+void BufferCache<P>::DoUpdateComputeBuffers() {
+ UpdateComputeUniformBuffers();
+ UpdateComputeStorageBuffers();
+}
+
+template <class P>
+void BufferCache<P>::UpdateIndexBuffer() {
+ // We have to check for the dirty flags and index count
+ // The index count is currently changed without updating the dirty flags
+ const auto& index_array = maxwell3d.regs.index_array;
+ auto& flags = maxwell3d.dirty.flags;
+ if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
+ return;
+ }
+ flags[Dirty::IndexBuffer] = false;
+ last_index_count = index_array.count;
+
+ const GPUVAddr gpu_addr_begin = index_array.StartAddress();
+ const GPUVAddr gpu_addr_end = index_array.EndAddress();
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
+ const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+ const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
+ const u32 size = std::min(address_size, draw_size);
+ if (size == 0 || !cpu_addr) {
+ index_buffer = NULL_BINDING;
+ return;
+ }
+ index_buffer = Binding{
+ .cpu_addr = *cpu_addr,
+ .size = size,
+ .buffer_id = FindBuffer(*cpu_addr, size),
+ };
+}
- std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
- std::shared_ptr<Buffer> second) {
- const std::size_t size_1 = first->Size();
- const std::size_t size_2 = second->Size();
- const VAddr first_addr = first->CpuAddr();
- const VAddr second_addr = second->CpuAddr();
- const VAddr new_addr = std::min(first_addr, second_addr);
- const std::size_t new_size = size_1 + size_2;
-
- std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
- new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
- new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
- QueueDestruction(std::move(first));
- QueueDestruction(std::move(second));
+template <class P>
+void BufferCache<P>::UpdateVertexBuffers() {
+ auto& flags = maxwell3d.dirty.flags;
+ if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
+ return;
+ }
+ flags[Dirty::VertexBuffers] = false;
- const VAddr cpu_addr_end = new_addr + new_size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- blocks.insert_or_assign(page_start, new_buffer);
- }
- return new_buffer;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ UpdateVertexBuffer(index);
}
+}
- Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
- std::shared_ptr<Buffer> found;
+template <class P>
+void BufferCache<P>::UpdateVertexBuffer(u32 index) {
+ if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
+ return;
+ }
+ const auto& array = maxwell3d.regs.vertex_array[index];
+ const auto& limit = maxwell3d.regs.vertex_array_limit[index];
+ const GPUVAddr gpu_addr_begin = array.StartAddress();
+ const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
+ const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+ const u32 size = address_size; // TODO: Analyze stride and number of vertices
+ if (array.enable == 0 || size == 0 || !cpu_addr) {
+ vertex_buffers[index] = NULL_BINDING;
+ return;
+ }
+ vertex_buffers[index] = Binding{
+ .cpu_addr = *cpu_addr,
+ .size = size,
+ .buffer_id = FindBuffer(*cpu_addr, size),
+ };
+}
+
+template <class P>
+void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
+ ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
+ Binding& binding = uniform_buffers[stage][index];
+ if (binding.buffer_id) {
+ // Already updated
+ return;
+ }
+ // Mark as dirty
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ dirty_uniform_buffers[stage] |= 1U << index;
+ }
+ // Resolve buffer
+ binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
+ });
+}
+
+template <class P>
+void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
+ const u32 written_mask = written_storage_buffers[stage];
+ ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
+ // Resolve buffer
+ Binding& binding = storage_buffers[stage][index];
+ const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
+ binding.buffer_id = buffer_id;
+ // Mark buffer as written if needed
+ if (((written_mask >> index) & 1) != 0) {
+ MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
+ }
+ });
+}
- const VAddr cpu_addr_end = cpu_addr + size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- auto it = blocks.find(page_start);
- if (it == blocks.end()) {
- if (found) {
- found = EnlargeBlock(found);
- continue;
- }
- const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
- found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
- blocks.insert_or_assign(page_start, found);
- continue;
- }
- if (!found) {
- found = it->second;
- continue;
- }
- if (found != it->second) {
- found = MergeBlocks(std::move(found), it->second);
+template <class P>
+void BufferCache<P>::UpdateTransformFeedbackBuffers() {
+ if (maxwell3d.regs.tfb_enabled == 0) {
+ return;
+ }
+ for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
+ UpdateTransformFeedbackBuffer(index);
+ }
+}
+
+template <class P>
+void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
+ const auto& binding = maxwell3d.regs.tfb_bindings[index];
+ const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
+ const u32 size = binding.buffer_size;
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+ if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
+ transform_feedback_buffers[index] = NULL_BINDING;
+ return;
+ }
+ const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+ transform_feedback_buffers[index] = Binding{
+ .cpu_addr = *cpu_addr,
+ .size = size,
+ .buffer_id = buffer_id,
+ };
+ MarkWrittenBuffer(buffer_id, *cpu_addr, size);
+}
+
+template <class P>
+void BufferCache<P>::UpdateComputeUniformBuffers() {
+ ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+ Binding& binding = compute_uniform_buffers[index];
+ binding = NULL_BINDING;
+ const auto& launch_desc = kepler_compute.launch_description;
+ if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
+ const auto& cbuf = launch_desc.const_buffer_config[index];
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
+ if (cpu_addr) {
+ binding.cpu_addr = *cpu_addr;
+ binding.size = cbuf.size;
}
}
- return found.get();
+ binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
+ });
+}
+
+template <class P>
+void BufferCache<P>::UpdateComputeStorageBuffers() {
+ ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
+ // Resolve buffer
+ Binding& binding = compute_storage_buffers[index];
+ const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
+ binding.buffer_id = buffer_id;
+ // Mark as written if needed
+ if (((written_compute_storage_buffers >> index) & 1) != 0) {
+ MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
+ }
+ });
+}
+
+template <class P>
+void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
+ Buffer& buffer = slot_buffers[buffer_id];
+ buffer.MarkRegionAsGpuModified(cpu_addr, size);
+
+ const bool is_accuracy_high = Settings::IsGPULevelHigh();
+ const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
+ if (!is_accuracy_high || !is_async) {
+ return;
+ }
+ if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
+ // Already inserted
+ return;
}
+ uncommitted_downloads.push_back(buffer_id);
+}
- void MarkRegionAsWritten(VAddr start, VAddr end) {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
- ++it->second;
- }
+template <class P>
+BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
+ if (cpu_addr == 0) {
+ return NULL_BUFFER_ID;
+ }
+ const u64 page = cpu_addr >> PAGE_BITS;
+ const BufferId buffer_id = page_table[page];
+ if (!buffer_id) {
+ return CreateBuffer(cpu_addr, size);
+ }
+ const Buffer& buffer = slot_buffers[buffer_id];
+ if (buffer.IsInBounds(cpu_addr, size)) {
+ return buffer_id;
+ }
+ return CreateBuffer(cpu_addr, size);
+}
+
+template <class P>
+typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr,
+ u32 wanted_size) {
+ static constexpr int STREAM_LEAP_THRESHOLD = 16;
+ std::vector<BufferId> overlap_ids;
+ VAddr begin = cpu_addr;
+ VAddr end = cpu_addr + wanted_size;
+ int stream_score = 0;
+ bool has_stream_leap = false;
+ for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) {
+ const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
+ if (!overlap_id) {
+ continue;
+ }
+ Buffer& overlap = slot_buffers[overlap_id];
+ if (overlap.IsPicked()) {
+ continue;
+ }
+ overlap_ids.push_back(overlap_id);
+ overlap.Pick();
+ const VAddr overlap_cpu_addr = overlap.CpuAddr();
+ if (overlap_cpu_addr < begin) {
+ cpu_addr = begin = overlap_cpu_addr;
+ }
+ end = std::max(end, overlap_cpu_addr + overlap.SizeBytes());
+
+ stream_score += overlap.StreamScore();
+ if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
+ // When this memory region has been joined a bunch of times, we assume it's being used
+ // as a stream buffer. Increase the size to skip constantly recreating buffers.
+ has_stream_leap = true;
+ end += PAGE_SIZE * 256;
}
}
-
- void UnmarkRegionAsWritten(VAddr start, VAddr end) {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- auto it = written_pages.find(page_start);
- if (it != written_pages.end()) {
- if (it->second > 1) {
- --it->second;
- } else {
- written_pages.erase(it);
- }
- }
+ return OverlapResult{
+ .ids = std::move(overlap_ids),
+ .begin = begin,
+ .end = end,
+ .has_stream_leap = has_stream_leap,
+ };
+}
+
+template <class P>
+void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
+ bool accumulate_stream_score) {
+ Buffer& new_buffer = slot_buffers[new_buffer_id];
+ Buffer& overlap = slot_buffers[overlap_id];
+ if (accumulate_stream_score) {
+ new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1);
+ }
+ std::vector<BufferCopy> copies;
+ const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
+ overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
+ copies.push_back(BufferCopy{
+ .src_offset = begin,
+ .dst_offset = dst_base_offset + begin,
+ .size = range_size,
+ });
+ new_buffer.UnmarkRegionAsCpuModified(begin, range_size);
+ new_buffer.MarkRegionAsGpuModified(begin, range_size);
+ });
+ if (!copies.empty()) {
+ runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
+ }
+ ReplaceBufferDownloads(overlap_id, new_buffer_id);
+ DeleteBuffer(overlap_id);
+}
+
+template <class P>
+BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
+ const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
+ const u32 size = static_cast<u32>(overlap.end - overlap.begin);
+ const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
+ for (const BufferId overlap_id : overlap.ids) {
+ JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
+ }
+ Register(new_buffer_id);
+ return new_buffer_id;
+}
+
+template <class P>
+void BufferCache<P>::Register(BufferId buffer_id) {
+ ChangeRegister<true>(buffer_id);
+}
+
+template <class P>
+void BufferCache<P>::Unregister(BufferId buffer_id) {
+ ChangeRegister<false>(buffer_id);
+}
+
+template <class P>
+template <bool insert>
+void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
+ const Buffer& buffer = slot_buffers[buffer_id];
+ const VAddr cpu_addr_begin = buffer.CpuAddr();
+ const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
+ const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
+ const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
+ for (u64 page = page_begin; page != page_end; ++page) {
+ if constexpr (insert) {
+ page_table[page] = buffer_id;
+ } else {
+ page_table[page] = BufferId{};
}
}
+}
- bool IsRegionWritten(VAddr start, VAddr end) const {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- if (written_pages.contains(page_start)) {
- return true;
+template <class P>
+void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
+ if (buffer.CpuAddr() == 0) {
+ return;
+ }
+ SynchronizeBufferImpl(buffer, cpu_addr, size);
+}
+
+template <class P>
+void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
+ boost::container::small_vector<BufferCopy, 4> copies;
+ u64 total_size_bytes = 0;
+ u64 largest_copy = 0;
+ buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+ copies.push_back(BufferCopy{
+ .src_offset = total_size_bytes,
+ .dst_offset = range_offset,
+ .size = range_size,
+ });
+ total_size_bytes += range_size;
+ largest_copy = std::max(largest_copy, range_size);
+ });
+ if (total_size_bytes == 0) {
+ return;
+ }
+ const std::span<BufferCopy> copies_span(copies.data(), copies.size());
+ UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
+}
+
+template <class P>
+void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
+ std::span<BufferCopy> copies) {
+ if constexpr (USE_MEMORY_MAPS) {
+ MappedUploadMemory(buffer, total_size_bytes, copies);
+ } else {
+ ImmediateUploadMemory(buffer, largest_copy, copies);
+ }
+}
+
+template <class P>
+void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
+ std::span<const BufferCopy> copies) {
+ std::span<u8> immediate_buffer;
+ for (const BufferCopy& copy : copies) {
+ std::span<const u8> upload_span;
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
+ if (IsRangeGranular(cpu_addr, copy.size)) {
+ upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size);
+ } else {
+ if (immediate_buffer.empty()) {
+ immediate_buffer = ImmediateBuffer(largest_copy);
}
+ cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
+ upload_span = immediate_buffer.subspan(0, copy.size);
}
- return false;
+ buffer.ImmediateUpload(copy.dst_offset, upload_span);
}
-
- void QueueDestruction(std::shared_ptr<Buffer> buffer) {
- buffer->SetEpoch(epoch);
- pending_destruction.push(std::move(buffer));
+}
+
+template <class P>
+void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
+ std::span<BufferCopy> copies) {
+ auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
+ const std::span<u8> staging_pointer = upload_staging.mapped_span;
+ for (BufferCopy& copy : copies) {
+ u8* const src_pointer = staging_pointer.data() + copy.src_offset;
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
+ cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
+
+ // Apply the staging offset
+ copy.src_offset += upload_staging.offset;
}
-
- void MarkForAsyncFlush(MapInterval* map) {
- if (!uncommitted_flushes) {
- uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
+ runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+}
+
+template <class P>
+void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
+ const auto scalar_replace = [buffer_id](Binding& binding) {
+ if (binding.buffer_id == buffer_id) {
+ binding.buffer_id = BufferId{};
+ }
+ };
+ const auto replace = [scalar_replace](std::span<Binding> bindings) {
+ std::ranges::for_each(bindings, scalar_replace);
+ };
+ scalar_replace(index_buffer);
+ replace(vertex_buffers);
+ std::ranges::for_each(uniform_buffers, replace);
+ std::ranges::for_each(storage_buffers, replace);
+ replace(transform_feedback_buffers);
+ replace(compute_uniform_buffers);
+ replace(compute_storage_buffers);
+ std::erase(cached_write_buffer_ids, buffer_id);
+
+ // Mark the whole buffer as CPU written to stop tracking CPU writes
+ Buffer& buffer = slot_buffers[buffer_id];
+ buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
+
+ Unregister(buffer_id);
+ delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
+
+ NotifyBufferDeletion();
+}
+
+template <class P>
+void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
+ const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
+ std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
+ if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
+ buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
}
- uncommitted_flushes->insert(map);
+ };
+ replace(uncommitted_downloads);
+ std::ranges::for_each(committed_downloads, replace);
+}
+
+template <class P>
+void BufferCache<P>::NotifyBufferDeletion() {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ dirty_uniform_buffers.fill(~u32{0});
}
+ auto& flags = maxwell3d.dirty.flags;
+ flags[Dirty::IndexBuffer] = true;
+ flags[Dirty::VertexBuffers] = true;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ flags[Dirty::VertexBuffer0 + index] = true;
+ }
+ has_deleted_buffers = true;
+}
+
+template <class P>
+typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
+ const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
+ const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr || size == 0) {
+ return NULL_BINDING;
+ }
+ // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range.
+ // It exists due to some games like Astral Chain operate out of bounds.
+ // Binding the whole map range would be technically correct, but games have large maps that make
+ // this approach unaffordable for now.
+ static constexpr u32 arbitrary_extra_bytes = 0xc000;
+ const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
+ const Binding binding{
+ .cpu_addr = *cpu_addr,
+ .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end),
+ .buffer_id = BufferId{},
+ };
+ return binding;
+}
+
+template <class P>
+std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
+ u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
+ if (IsRangeGranular(cpu_addr, size) ||
+ base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) {
+ return std::span(base_pointer, size);
+ } else {
+ const std::span<u8> span = ImmediateBuffer(size);
+ cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
+ return span;
+ }
+}
- VideoCore::RasterizerInterface& rasterizer;
- Tegra::MemoryManager& gpu_memory;
- Core::Memory::Memory& cpu_memory;
- StreamBuffer& stream_buffer;
-
- u8* buffer_ptr = nullptr;
- u64 buffer_offset = 0;
- u64 buffer_offset_base = 0;
-
- MapIntervalAllocator mapped_addresses_allocator;
- boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
- mapped_addresses;
-
- std::unordered_map<u64, u32> written_pages;
- std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
-
- std::queue<std::shared_ptr<Buffer>> pending_destruction;
- u64 epoch = 0;
- u64 modified_ticks = 0;
-
- std::vector<u8> staging_buffer;
-
- std::list<MapInterval*> marked_for_unregister;
-
- std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
- std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
-
- std::recursive_mutex mutex;
-};
+template <class P>
+std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
+ if (wanted_capacity > immediate_buffer_capacity) {
+ immediate_buffer_capacity = wanted_capacity;
+ immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
+ }
+ return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
+}
+
+template <class P>
+bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
+ if constexpr (IS_OPENGL) {
+ return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0;
+ } else {
+ // Only OpenGL has fast uniform buffers
+ return false;
+ }
+}
} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2020 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <memory>
-
-#include "video_core/buffer_cache/map_interval.h"
-
-namespace VideoCommon {
-
-MapIntervalAllocator::MapIntervalAllocator() {
- FillFreeList(first_chunk);
-}
-
-MapIntervalAllocator::~MapIntervalAllocator() = default;
-
-void MapIntervalAllocator::AllocateNewChunk() {
- *new_chunk = std::make_unique<Chunk>();
- FillFreeList(**new_chunk);
- new_chunk = &(*new_chunk)->next;
-}
-
-void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
- const std::size_t old_size = free_list.size();
- free_list.resize(old_size + chunk.data.size());
- std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
- [](MapInterval& interval) { return &interval; });
-}
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <cstddef>
-#include <memory>
-#include <vector>
-
-#include <boost/intrusive/set_hook.hpp>
-
-#include "common/common_types.h"
-#include "video_core/gpu.h"
-
-namespace VideoCommon {
-
-struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
- MapInterval() = default;
-
- /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
-
- explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
- : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
-
- bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
- return start <= other_start && other_end <= end;
- }
-
- bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
- return start < other_end && other_start < end;
- }
-
- void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
- is_modified = is_modified_;
- ticks = ticks_;
- }
-
- boost::intrusive::set_member_hook<> member_hook_;
- VAddr start = 0;
- VAddr end = 0;
- GPUVAddr gpu_addr = 0;
- u64 ticks = 0;
- bool is_written = false;
- bool is_modified = false;
- bool is_registered = false;
- bool is_memory_marked = false;
- bool is_sync_pending = false;
-};
-
-struct MapIntervalCompare {
- constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
- return lhs.start < rhs.start;
- }
-};
-
-class MapIntervalAllocator {
-public:
- MapIntervalAllocator();
- ~MapIntervalAllocator();
-
- MapInterval* Allocate() {
- if (free_list.empty()) {
- AllocateNewChunk();
- }
- MapInterval* const interval = free_list.back();
- free_list.pop_back();
- return interval;
- }
-
- void Release(MapInterval* interval) {
- free_list.push_back(interval);
- }
-
-private:
- struct Chunk {
- std::unique_ptr<Chunk> next;
- std::array<MapInterval, 0x8000> data;
- };
-
- void AllocateNewChunk();
-
- void FillFreeList(Chunk& chunk);
-
- std::vector<MapInterval*> free_list;
-
- Chunk first_chunk;
-
- std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
-};
-
-} // namespace VideoCommon