From ed4553806a08e4130fcea36230985cb74d1b326a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 22 Apr 2023 20:10:40 +0200 Subject: Implement Async downloads in normal and fix a few issues. --- src/video_core/buffer_cache/buffer_cache.h | 69 +++++++++++++------------ src/video_core/buffer_cache/buffer_cache_base.h | 25 +++++++-- src/video_core/buffer_cache/word_manager.h | 6 ++- 3 files changed, 61 insertions(+), 39 deletions(-) (limited to 'src') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 43fe5b080..faa48a678 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -22,6 +22,8 @@ BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; critical_memory = DEFAULT_CRITICAL_MEMORY; @@ -72,6 +74,8 @@ void BufferCache

::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -130,7 +134,7 @@ void BufferCache

::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { template void BufferCache

::ClearDownload(IntervalType subtract_interval) { - async_downloads -= std::make_pair(subtract_interval, std::numeric_limits::max()); + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); uncommitted_ranges.subtract(subtract_interval); pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { @@ -173,18 +177,14 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am }}; boost::container::small_vector tmp_intervals; - const bool is_high_accuracy = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; auto mirror = [&](VAddr base_address, VAddr base_address_end) { const u64 size = base_address_end - base_address; const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; tmp_intervals.push_back(add_interval); - if (is_high_accuracy) { - uncommitted_ranges.add(add_interval); - pending_ranges.add(add_interval); - } + uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); }; ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. @@ -468,7 +468,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { async_buffers.emplace_back(std::optional{}); } return; @@ -529,31 +529,33 @@ void BufferCache

::CommitAsyncFlushesHigh() { } committed_ranges.clear(); if (downloads.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { async_buffers.emplace_back(std::optional{}); } return; } - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); - boost::container::small_vector normalized_copies; - IntervalSet new_async_range{}; - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - BufferCopy second_copy{copy}; - Buffer& buffer = slot_buffers[buffer_id]; - second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; - VAddr orig_cpu_addr = static_cast(second_copy.src_offset); - const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; - async_downloads += std::make_pair(base_interval, 1); - runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); - normalized_copies.push_back(second_copy); + if (active_async_buffers) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + IntervalSet new_async_range{}; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + async_downloads += std::make_pair(base_interval, 1); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); } - runtime.PostCopyBarrier(); - pending_downloads.emplace_back(std::move(normalized_copies)); - async_buffers.emplace_back(download_staging); } else { if constexpr (USE_MEMORY_MAPS) { auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); @@ -624,7 +626,8 @@ void BufferCache

::PopAsyncBuffers() { common_ranges.subtract(base_interval); } }); - async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1); + const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); } runtime.FreeDeferredStagingBuffer(*async_buffer); async_buffers.pop_front(); @@ -1198,10 +1201,8 @@ void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - uncommitted_ranges.add(base_interval); - pending_ranges.add(base_interval); - } + uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } template @@ -1542,7 +1543,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si .size = new_size, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 6f29cba25..d4914a8f5 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -345,13 +345,30 @@ private: if (inter_addr < start_address) { inter_addr = start_address; } - if (it->second <= 0) { - __debugbreak(); - } func(inter_addr, inter_addr_end, it->second); } } + void RemoveEachInOverlapCounter(OverlapCounter& current_range, const IntervalType search_interval, int subtract_value) { + bool any_removals = false; + current_range.add(std::make_pair(search_interval, subtract_value)); + do { + any_removals = false; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + any_removals = true; + current_range.erase(it); + break; + } + } + } while (any_removals); + } + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); @@ -554,6 +571,8 @@ private: u64 minimum_memory = 0; u64 critical_memory = 0; + bool active_async_buffers = false; + std::array> PAGE_BITS)> page_table; }; diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h index 782951fe7..21729752b 100644 --- a/src/video_core/buffer_cache/word_manager.h +++ b/src/video_core/buffer_cache/word_manager.h @@ -273,7 +273,7 @@ public: untracked_words[word_index] &= ~bits; NotifyRasterizer(word_index, current_bits, ~u64{0}); } - const u64 word = current_word; + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); u64 page = page_begin; page_begin = 0; @@ -321,6 +321,7 @@ public: [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); + const u64* const untracked_words = Array(); const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; @@ -328,7 +329,8 @@ public: const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 word = state_words[word_index]; + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } -- cgit v1.2.3