diff options
Diffstat (limited to 'src/video_core/buffer_cache/buffer_cache.h')
-rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 161 |
1 files changed, 116 insertions, 45 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d371b842f..6d04d00da 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -16,6 +16,7 @@ #include <boost/container/small_vector.hpp> +#include "common/common_sizes.h" #include "common/common_types.h" #include "common/div_ceil.h" #include "common/microprofile.h" @@ -65,6 +66,9 @@ class BufferCache { static constexpr BufferId NULL_BUFFER_ID{0}; + static constexpr u64 EXPECTED_MEMORY = Common::Size_512_MB; + static constexpr u64 CRITICAL_MEMORY = Common::Size_1_GB; + using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Runtime = typename P::Runtime; @@ -102,6 +106,8 @@ public: void TickFrame(); + void RunGarbageCollector(); + void WriteMemory(VAddr cpu_addr, u64 size); void CachedWriteMemory(VAddr cpu_addr, u64 size); @@ -243,6 +249,8 @@ private: template <bool insert> void ChangeRegister(BufferId buffer_id); + void TouchBuffer(Buffer& buffer) const noexcept; + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); @@ -255,6 +263,10 @@ private: void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + void DeleteBuffer(BufferId buffer_id); void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); @@ -319,6 +331,10 @@ private: size_t immediate_buffer_capacity = 0; std::unique_ptr<u8[]> immediate_buffer_alloc; + typename SlotVector<Buffer>::Iterator deletion_iterator; + u64 frame_tick = 0; + u64 total_used_memory = 0; + std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; }; @@ -332,6 +348,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); + deletion_iterator = slot_buffers.end(); +} + +template <class P> +void BufferCache<P>::RunGarbageCollector() { + const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; + const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; + int num_iterations = aggressive_gc ? 64 : 32; + for (; num_iterations > 0; --num_iterations) { + if (deletion_iterator == slot_buffers.end()) { + deletion_iterator = slot_buffers.begin(); + } + ++deletion_iterator; + if (deletion_iterator == slot_buffers.end()) { + break; + } + const auto [buffer_id, buffer] = *deletion_iterator; + if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { + DownloadBufferMemory(*buffer); + DeleteBuffer(buffer_id); + } + } } template <class P> @@ -349,6 +387,10 @@ void BufferCache<P>::TickFrame() { const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; + if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { + RunGarbageCollector(); + } + ++frame_tick; delayed_destruction_ring.Tick(); } @@ -371,50 +413,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - boost::container::small_vector<BufferCopy, 1> copies; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = range_offset, - .dst_offset = total_size_bytes, - .size = range_size, - }); - total_size_bytes += range_size; - largest_copy = std::max(largest_copy, range_size); - }); - if (total_size_bytes == 0) { - return; - } - MICROPROFILE_SCOPE(GPU_DownloadMemory); - - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - const u8* const mapped_memory = download_staging.mapped_span.data(); - const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); - for (BufferCopy& copy : copies) { - // Modify copies to have the staging offset in mind - copy.dst_offset += download_staging.offset; - } - runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); - runtime.Finish(); - for (const BufferCopy& copy : copies) { - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* copy_mapped_memory = mapped_memory + dst_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); - } - } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const BufferCopy& copy : copies) { - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); - } - } - }); + ForEachBufferInRange(cpu_addr, size, + [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); }); } template <class P> @@ -640,6 +640,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { template <class P> void BufferCache<P>::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + TouchBuffer(buffer); const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); @@ -658,6 +659,7 @@ void BufferCache<P>::BindHostVertexBuffers() { for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { const Binding& binding = vertex_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -693,6 +695,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 const VAddr cpu_addr = binding.cpu_addr; const u32 size = binding.size; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size); @@ -744,6 +747,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { const Binding& binding = storage_buffers[stage][index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -766,6 +770,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { const Binding& binding = transform_feedback_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -784,6 +789,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { const Binding& binding = compute_uniform_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -803,6 +809,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { const Binding& binding = compute_storage_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1101,6 +1108,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); + TouchBuffer(slot_buffers[new_buffer_id]); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -1122,8 +1130,14 @@ template <class P> template <bool insert> void BufferCache<P>::ChangeRegister(BufferId buffer_id) { const Buffer& buffer = slot_buffers[buffer_id]; + const auto size = buffer.SizeBytes(); + if (insert) { + total_used_memory += Common::AlignUp(size, 1024); + } else { + total_used_memory -= Common::AlignUp(size, 1024); + } const VAddr cpu_addr_begin = buffer.CpuAddr(); - const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const VAddr cpu_addr_end = cpu_addr_begin + size; const u64 page_begin = cpu_addr_begin / PAGE_SIZE; const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { @@ -1136,6 +1150,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } template <class P> +void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { + buffer.SetFrameTick(frame_tick); +} + +template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { if (buffer.CpuAddr() == 0) { return true; @@ -1212,6 +1231,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, } template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { + DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); +} + +template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) { + boost::container::small_vector<BufferCopy, 1> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; + } + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); + } + } +} + +template <class P> void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { @@ -1236,6 +1306,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + slot_buffers.erase(buffer_id); NotifyBufferDeletion(); } |