Diffstat (limited to 'src/video_core/buffer_cache/buffer_cache.h')
-rw-r--r--   src/video_core/buffer_cache/buffer_cache.h   259
1 file changed, 153 insertions(+), 106 deletions(-)
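The most substantial behavioral change in this diff replaces the fixed garbage-collection thresholds (EXPECTED_MEMORY = 512 MiB, CRITICAL_MEMORY = 1 GiB) with per-device values derived in the constructor from the runtime's reported device-local memory. A minimal sketch of that computation, assuming only that the backend can report its VRAM size; ComputeThresholds is a hypothetical standalone wrapper written for illustration, while the constants and the formula follow the constructor hunk below:

#include <algorithm>
#include <cstdint>

using s64 = std::int64_t;
using u64 = std::uint64_t;

constexpr s64 MiB = 1024 * 1024;
constexpr s64 GiB = 1024 * MiB;
constexpr s64 DEFAULT_EXPECTED_MEMORY = 512 * MiB; // old fixed EXPECTED_MEMORY
constexpr s64 DEFAULT_CRITICAL_MEMORY = 1 * GiB;   // old fixed CRITICAL_MEMORY
constexpr s64 TARGET_THRESHOLD = 4 * GiB;          // cap for the vacancy terms

struct Thresholds {
    u64 minimum_memory;  // GC starts once usage exceeds this
    u64 critical_memory; // GC becomes aggressive above this
};

// Hypothetical wrapper around the logic in BufferCache's constructor: the
// trigger points guarantee at least max(60% of min(VRAM, 4 GiB), 1.5 GiB)
// free before normal GC and max(30% of the same budget, 1 GiB) before
// aggressive GC, and never drop below the old fixed defaults.
Thresholds ComputeThresholds(s64 device_memory) {
    const s64 min_spacing_expected = device_memory - 1 * GiB - 512 * MiB;
    const s64 min_spacing_critical = device_memory - 1 * GiB;
    const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD);
    const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
    const s64 min_vacancy_critical = (3 * mem_threshold) / 10;
    return Thresholds{
        .minimum_memory = static_cast<u64>(
            std::max(std::min(device_memory - min_vacancy_expected, min_spacing_expected),
                     DEFAULT_EXPECTED_MEMORY)),
        .critical_memory = static_cast<u64>(
            std::max(std::min(device_memory - min_vacancy_critical, min_spacing_critical),
                     DEFAULT_CRITICAL_MEMORY)),
    };
}

Worked through, an 8 GiB device gets roughly 5.6 GiB / 6.8 GiB, while a 2 GiB device clamps back to the old 512 MiB / 1 GiB defaults. TickFrame correspondingly switches from the running estimate to runtime.GetDeviceMemoryUsage() whenever the backend can report real usage.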
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index fa26eb8b0..8e26b3f95 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1,17 +1,14 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
+// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
 
 #include <algorithm>
 #include <array>
-#include <deque>
 #include <memory>
 #include <mutex>
 #include <numeric>
 #include <span>
-#include <unordered_map>
 #include <vector>
 
 #include <boost/container/small_vector.hpp>
@@ -22,10 +19,10 @@
 #include "common/literals.h"
 #include "common/lru_cache.h"
 #include "common/microprofile.h"
-#include "common/scope_exit.h"
 #include "common/settings.h"
 #include "core/memory.h"
 #include "video_core/buffer_cache/buffer_base.h"
+#include "video_core/control/channel_state_cache.h"
 #include "video_core/delayed_destruction_ring.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/kepler_compute.h"
@@ -59,12 +56,12 @@ using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFE
 using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
 
 template <typename P>
-class BufferCache {
+class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
     // Page size for caching purposes.
     // This is unrelated to the CPU page size and it can be changed as it seems optimal.
-    static constexpr u32 PAGE_BITS = 16;
-    static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
+    static constexpr u32 YUZU_PAGEBITS = 16;
+    static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS;
 
     static constexpr bool IS_OPENGL = P::IS_OPENGL;
     static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
@@ -78,8 +75,9 @@
 
     static constexpr BufferId NULL_BUFFER_ID{0};
 
-    static constexpr u64 EXPECTED_MEMORY = 512_MiB;
-    static constexpr u64 CRITICAL_MEMORY = 1_GiB;
+    static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
+    static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
+    static constexpr s64 TARGET_THRESHOLD = 4_GiB;
 
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -118,10 +116,7 @@ public:
     static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
 
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
-                         Tegra::Engines::Maxwell3D& maxwell3d_,
-                         Tegra::Engines::KeplerCompute& kepler_compute_,
-                         Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
-                         Runtime& runtime_);
+                         Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
 
     void TickFrame();
@@ -131,7 +126,7 @@ public:
 
     void DownloadMemory(VAddr cpu_addr, u64 size);
 
-    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
+    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
 
     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
@@ -218,8 +213,8 @@ private:
     template <typename Func>
     void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
-        const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
-        for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
+        const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE);
+        for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) {
            const BufferId buffer_id = page_table[page];
            if (!buffer_id) {
                ++page;
@@ -229,7 +224,7 @@
            func(buffer_id, buffer);
            const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
-            page = Common::DivCeil(end_addr, PAGE_SIZE);
+            page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
        }
    }
@@ -264,8 +259,8 @@
    }
 
    static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
-        return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
-               ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
+        return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
+               ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
    }
 
    void RunGarbageCollector();
@@ -355,7 +350,7 @@
 
    void NotifyBufferDeletion();
 
-    [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
+    [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, bool is_written = false) const;
 
    [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
                                                               PixelFormat format);
@@ -369,9 +364,6 @@
    void ClearDownload(IntervalType subtract_interval);
 
    VideoCore::RasterizerInterface& rasterizer;
-    Tegra::Engines::Maxwell3D& maxwell3d;
-    Tegra::Engines::KeplerCompute& kepler_compute;
-    Tegra::MemoryManager& gpu_memory;
    Core::Memory::Memory& cpu_memory;
 
    SlotVector<Buffer> slot_buffers;
@@ -438,26 +430,43 @@
    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
 
    u64 frame_tick = 0;
    u64 total_used_memory = 0;
+    u64 minimum_memory = 0;
+    u64 critical_memory = 0;
 
-    std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
+    std::array<BufferId, ((1ULL << 39) >> YUZU_PAGEBITS)> page_table;
 };
 
 template <class P>
 BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
-                            Tegra::Engines::Maxwell3D& maxwell3d_,
-                            Tegra::Engines::KeplerCompute& kepler_compute_,
-                            Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
-                            Runtime& runtime_)
-    : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_},
-      kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_} {
+                            Core::Memory::Memory& cpu_memory_, Runtime& runtime_)
+    : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} {
    // Ensure the first slot is used for the null buffer
    void(slot_buffers.insert(runtime, NullBufferParams{}));
    common_ranges.clear();
+
+    if (!runtime.CanReportMemoryUsage()) {
+        minimum_memory = DEFAULT_EXPECTED_MEMORY;
+        critical_memory = DEFAULT_CRITICAL_MEMORY;
+        return;
+    }
+
+    const s64 device_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
+    const s64 min_spacing_expected = device_memory - 1_GiB - 512_MiB;
+    const s64 min_spacing_critical = device_memory - 1_GiB;
+    const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD);
+    const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
+    const s64 min_vacancy_critical = (3 * mem_threshold) / 10;
+    minimum_memory = static_cast<u64>(
+        std::max(std::min(device_memory - min_vacancy_expected, min_spacing_expected),
+                 DEFAULT_EXPECTED_MEMORY));
+    critical_memory = static_cast<u64>(
+        std::max(std::min(device_memory - min_vacancy_critical, min_spacing_critical),
+                 DEFAULT_CRITICAL_MEMORY));
 }
 
 template <class P>
 void BufferCache<P>::RunGarbageCollector() {
-    const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
+    const bool aggressive_gc = total_used_memory >= critical_memory;
    const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
    int num_iterations = aggressive_gc ? 64 : 32;
    const auto clean_up = [this, &num_iterations](BufferId buffer_id) {
@@ -488,7 +497,11 @@ void BufferCache<P>::TickFrame() {
    const bool skip_preferred = hits * 256 < shots * 251;
    uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 
-    if (total_used_memory >= EXPECTED_MEMORY) {
+    // If we can obtain the memory info, use it instead of the estimate.
+    if (runtime.CanReportMemoryUsage()) {
+        total_used_memory = runtime.GetDeviceMemoryUsage();
+    }
+    if (total_used_memory >= minimum_memory) {
        RunGarbageCollector();
    }
    ++frame_tick;
@@ -529,8 +542,8 @@ void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
 
 template <class P>
 bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
-    const std::optional<VAddr> cpu_src_address = gpu_memory.GpuToCpuAddress(src_address);
-    const std::optional<VAddr> cpu_dest_address = gpu_memory.GpuToCpuAddress(dest_address);
+    const std::optional<VAddr> cpu_src_address = gpu_memory->GpuToCpuAddress(src_address);
+    const std::optional<VAddr> cpu_dest_address = gpu_memory->GpuToCpuAddress(dest_address);
    if (!cpu_src_address || !cpu_dest_address) {
        return false;
    }
@@ -588,7 +601,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
 
 template <class P>
 bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
-    const std::optional<VAddr> cpu_dst_address = gpu_memory.GpuToCpuAddress(dst_address);
+    const std::optional<VAddr> cpu_dst_address = gpu_memory->GpuToCpuAddress(dst_address);
    if (!cpu_dst_address) {
        return false;
    }
@@ -612,7 +625,7 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
 template <class P>
 void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
                                                u32 size) {
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
    const Binding binding{
        .cpu_addr = *cpu_addr,
        .size = size,
@@ -650,7 +663,7 @@ void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
    if (is_indexed) {
        BindHostIndexBuffer();
    } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
-        const auto& regs = maxwell3d.regs;
+        const auto& regs = maxwell3d->regs;
        if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
            runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
        }
@@ -710,9 +723,9 @@ void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index,
    enabled_storage_buffers[stage] |= 1U << ssbo_index;
    written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index;
 
-    const auto& cbufs = maxwell3d.state.shader_stages[stage];
+    const auto& cbufs = maxwell3d->state.shader_stages[stage];
    const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
-    storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr);
+    storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, is_written);
 }
 
 template <class P>
@@ -747,12 +760,12 @@ void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index,
    enabled_compute_storage_buffers |= 1U << ssbo_index;
    written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index;
 
-    const auto& launch_desc = kepler_compute.launch_description;
+    const auto& launch_desc = kepler_compute->launch_description;
    ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
 
    const auto& cbufs = launch_desc.const_buffer_config;
    const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
-    compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr);
+    compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, is_written);
 }
 
 template <class P>
@@ -813,6 +826,19 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
    const bool is_accuracy_normal =
        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;
+    auto it = committed_ranges.begin();
+    while (it != committed_ranges.end()) {
+        auto& current_intervals = *it;
+        auto next_it = std::next(it);
+        while (next_it != committed_ranges.end()) {
+            for (auto& interval : *next_it) {
+                current_intervals.subtract(interval);
+            }
+            next_it++;
+        }
+        it++;
+    }
+
    boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
    u64 total_size_bytes = 0;
    u64 largest_copy = 0;
@@ -903,8 +929,8 @@
 template <class P>
 void BufferCache<P>::PopAsyncFlushes() {}
 
 template <class P>
 bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
-    const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
-    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+    const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE);
+    for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) {
        const BufferId image_id = page_table[page];
        if (!image_id) {
            ++page;
@@ -915,7 +941,7 @@
            return true;
        }
        const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
-        page = Common::DivCeil(end_addr, PAGE_SIZE);
+        page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
    }
    return false;
 }
@@ -923,8 +949,8 @@
 template <class P>
 bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
    const VAddr end_addr = addr + size;
-    const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
-    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+    const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE);
+    for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) {
        const BufferId buffer_id = page_table[page];
        if (!buffer_id) {
            ++page;
@@ -936,15 +962,15 @@
        if (buf_start_addr < end_addr && addr < buf_end_addr) {
            return true;
        }
-        page = Common::DivCeil(end_addr, PAGE_SIZE);
+        page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
    }
    return false;
 }
 
 template <class P>
 bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
-    const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
-    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+    const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE);
+    for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) {
        const BufferId image_id = page_table[page];
        if (!image_id) {
            ++page;
@@ -955,7 +981,7 @@
            return true;
        }
        const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
-        page = Common::DivCeil(end_addr, PAGE_SIZE);
+        page = Common::DivCeil(end_addr, YUZU_PAGESIZE);
    }
    return false;
 }
@@ -968,19 +994,19 @@ void BufferCache<P>::BindHostIndexBuffer() {
    const u32 size = index_buffer.size;
    SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
    if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
-        const u32 new_offset = offset + maxwell3d.regs.index_array.first *
-                                            maxwell3d.regs.index_array.FormatSizeInBytes();
+        const u32 new_offset = offset + maxwell3d->regs.index_array.first *
+                                            maxwell3d->regs.index_array.FormatSizeInBytes();
        runtime.BindIndexBuffer(buffer, new_offset, size);
    } else {
-        runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
-                                maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
-                                buffer, offset, size);
+        runtime.BindIndexBuffer(maxwell3d->regs.draw.topology, maxwell3d->regs.index_array.format,
+                                maxwell3d->regs.index_array.first,
+                                maxwell3d->regs.index_array.count, buffer, offset, size);
    }
 }
 
 template <class P>
 void BufferCache<P>::BindHostVertexBuffers() {
-    auto& flags = maxwell3d.dirty.flags;
+    auto& flags = maxwell3d->dirty.flags;
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
        const Binding& binding = vertex_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
@@ -991,7 +1017,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
        }
        flags[Dirty::VertexBuffer0 + index] = false;
 
-        const u32 stride = maxwell3d.regs.vertex_array[index].stride;
+        const u32 stride = maxwell3d->regs.vertex_array[index].stride;
        const u32 offset = buffer.Offset(binding.cpu_addr);
        runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
    }
@@ -1131,7 +1157,7 @@ void BufferCache<P>::BindHostGraphicsTextureBuffers(size_t stage) {
 
 template <class P>
 void BufferCache<P>::BindHostTransformFeedbackBuffers() {
-    if (maxwell3d.regs.tfb_enabled == 0) {
+    if (maxwell3d->regs.tfb_enabled == 0) {
        return;
    }
    for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
@@ -1216,16 +1242,19 @@ void BufferCache<P>::BindHostComputeTextureBuffers() {
 
 template <class P>
 void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
-    if (is_indexed) {
-        UpdateIndexBuffer();
-    }
-    UpdateVertexBuffers();
-    UpdateTransformFeedbackBuffers();
-    for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
-        UpdateUniformBuffers(stage);
-        UpdateStorageBuffers(stage);
-        UpdateTextureBuffers(stage);
-    }
+    do {
+        has_deleted_buffers = false;
+        if (is_indexed) {
+            UpdateIndexBuffer();
+        }
+        UpdateVertexBuffers();
+        UpdateTransformFeedbackBuffers();
+        for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+            UpdateUniformBuffers(stage);
+            UpdateStorageBuffers(stage);
+            UpdateTextureBuffers(stage);
+        }
+    } while (has_deleted_buffers);
 }
 
 template <class P>
@@ -1239,8 +1268,8 @@
 template <class P>
 void BufferCache<P>::UpdateIndexBuffer() {
    // We have to check for the dirty flags and index count
    // The index count is currently changed without updating the dirty flags
-    const auto& index_array = maxwell3d.regs.index_array;
-    auto& flags = maxwell3d.dirty.flags;
+    const auto& index_array = maxwell3d->regs.index_array;
+    auto& flags = maxwell3d->dirty.flags;
    if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
        return;
    }
@@ -1249,7 +1278,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
    const GPUVAddr gpu_addr_begin = index_array.StartAddress();
    const GPUVAddr gpu_addr_end = index_array.EndAddress();
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin);
    const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
    const u32 draw_size =
        (index_array.count + index_array.first) * index_array.FormatSizeInBytes();
    const u32 size = std::min(address_size, draw_size);
@@ -1266,8 +1295,8 @@ void BufferCache<P>::UpdateIndexBuffer() {
 
 template <class P>
 void BufferCache<P>::UpdateVertexBuffers() {
-    auto& flags = maxwell3d.dirty.flags;
-    if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) {
+    auto& flags = maxwell3d->dirty.flags;
+    if (!maxwell3d->dirty.flags[Dirty::VertexBuffers]) {
        return;
    }
    flags[Dirty::VertexBuffers] = false;
@@ -1279,20 +1308,25 @@
 
 template <class P>
 void BufferCache<P>::UpdateVertexBuffer(u32 index) {
-    if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) {
+    if (!maxwell3d->dirty.flags[Dirty::VertexBuffer0 + index]) {
        return;
    }
-    const auto& array = maxwell3d.regs.vertex_array[index];
-    const auto& limit = maxwell3d.regs.vertex_array_limit[index];
+    const auto& array = maxwell3d->regs.vertex_array[index];
+    const auto& limit = maxwell3d->regs.vertex_array_limit[index];
    const GPUVAddr gpu_addr_begin = array.StartAddress();
    const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
-    const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
-    const u32 size = address_size; // TODO: Analyze stride and number of vertices
-    if (array.enable == 0 || size == 0 || !cpu_addr) {
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin);
+    u32 address_size = static_cast<u32>(
+        std::min(gpu_addr_end - gpu_addr_begin, static_cast<u64>(std::numeric_limits<u32>::max())));
+    if (array.enable == 0 || address_size == 0 || !cpu_addr) {
        vertex_buffers[index] = NULL_BINDING;
        return;
    }
+    if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) {
+        address_size =
+            static_cast<u32>(gpu_memory->MaxContinousRange(gpu_addr_begin, address_size));
+    }
+    const u32 size = address_size; // TODO: Analyze stride and number of vertices
    vertex_buffers[index] = Binding{
        .cpu_addr = *cpu_addr,
        .size = size,
@@ -1346,7 +1380,7 @@ void BufferCache<P>::UpdateTextureBuffers(size_t stage) {
 
 template <class P>
 void BufferCache<P>::UpdateTransformFeedbackBuffers() {
-    if (maxwell3d.regs.tfb_enabled == 0) {
+    if (maxwell3d->regs.tfb_enabled == 0) {
        return;
    }
    for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
@@ -1356,10 +1390,10 @@ void BufferCache<P>::UpdateTransformFeedbackBuffers() {
 
 template <class P>
 void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
-    const auto& binding = maxwell3d.regs.tfb_bindings[index];
+    const auto& binding = maxwell3d->regs.tfb_bindings[index];
    const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset;
    const u32 size = binding.buffer_size;
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
    if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) {
        transform_feedback_buffers[index] = NULL_BINDING;
        return;
@@ -1378,10 +1412,10 @@ void BufferCache<P>::UpdateComputeUniformBuffers() {
    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
        Binding& binding = compute_uniform_buffers[index];
        binding = NULL_BINDING;
-        const auto& launch_desc = kepler_compute.launch_description;
+        const auto& launch_desc = kepler_compute->launch_description;
        if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
            const auto& cbuf = launch_desc.const_buffer_config[index];
-            const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
+            const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(cbuf.Address());
            if (cpu_addr) {
                binding.cpu_addr = *cpu_addr;
                binding.size = cbuf.size;
@@ -1436,7 +1470,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
    if (cpu_addr == 0) {
        return NULL_BUFFER_ID;
    }
-    const u64 page = cpu_addr >> PAGE_BITS;
+    const u64 page = cpu_addr >> YUZU_PAGEBITS;
    const BufferId buffer_id = page_table[page];
    if (!buffer_id) {
        return CreateBuffer(cpu_addr, size);
@@ -1457,8 +1491,9 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
    VAddr end = cpu_addr + wanted_size;
    int stream_score = 0;
    bool has_stream_leap = false;
-    for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) {
-        const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
+    for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE);
+         cpu_addr += YUZU_PAGESIZE) {
+        const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS];
        if (!overlap_id) {
            continue;
        }
@@ -1469,19 +1504,27 @@
        overlap_ids.push_back(overlap_id);
        overlap.Pick();
        const VAddr overlap_cpu_addr = overlap.CpuAddr();
-        if (overlap_cpu_addr < begin) {
+        const bool expands_left = overlap_cpu_addr < begin;
+        if (expands_left) {
            cpu_addr = begin = overlap_cpu_addr;
        }
-        end = std::max(end, overlap_cpu_addr + overlap.SizeBytes());
-
+        const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
+        const bool expands_right = overlap_end > end;
+        if (overlap_end > end) {
+            end = overlap_end;
+        }
        stream_score += overlap.StreamScore();
        if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
            // When this memory region has been joined a bunch of times, we assume it's being used
            // as a stream buffer. Increase the size to skip constantly recreating buffers.
            has_stream_leap = true;
-            begin -= PAGE_SIZE * 256;
-            cpu_addr = begin;
-            end += PAGE_SIZE * 256;
+            if (expands_right) {
+                begin -= YUZU_PAGESIZE * 256;
+                cpu_addr = begin;
+            }
+            if (expands_left) {
+                end += YUZU_PAGESIZE * 256;
+            }
        }
    }
    return OverlapResult{
@@ -1522,6 +1565,8 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
    const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
    const u32 size = static_cast<u32>(overlap.end - overlap.begin);
    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
+    auto& new_buffer = slot_buffers[new_buffer_id];
+    runtime.ClearBuffer(new_buffer, 0, new_buffer.SizeBytes(), 0);
    for (const BufferId overlap_id : overlap.ids) {
        JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
    }
@@ -1554,8 +1599,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
    }
    const VAddr cpu_addr_begin = buffer.CpuAddr();
    const VAddr cpu_addr_end = cpu_addr_begin + size;
-    const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
-    const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
+    const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE;
+    const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE);
    for (u64 page = page_begin; page != page_end; ++page) {
        if constexpr (insert) {
            page_table[page] = buffer_id;
@@ -1650,7 +1695,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
 
 template <class P>
 bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
-                                  std::span<u8> inlined_buffer) {
+                                  std::span<const u8> inlined_buffer) {
    const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
    if (!is_dirty) {
        return false;
@@ -1786,7 +1831,7 @@ void BufferCache<P>::NotifyBufferDeletion() {
        dirty_uniform_buffers.fill(~u32{0});
        uniform_buffer_binding_sizes.fill({});
    }
-    auto& flags = maxwell3d.dirty.flags;
+    auto& flags = maxwell3d->dirty.flags;
    flags[Dirty::IndexBuffer] = true;
    flags[Dirty::VertexBuffers] = true;
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
@@ -1796,16 +1841,18 @@
 }
 
 template <class P>
-typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
-    const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
-    const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr,
+                                                                      bool is_written) const {
+    const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr);
+    const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8);
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
    if (!cpu_addr || size == 0) {
        return NULL_BINDING;
    }
+    const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE);
    const Binding binding{
        .cpu_addr = *cpu_addr,
-        .size = size,
+        .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr),
        .buffer_id = BufferId{},
    };
    return binding;
@@ -1814,7 +1861,7 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
 template <class P>
 typename BufferCache<P>::TextureBufferBinding BufferCache<P>::GetTextureBufferBinding(
    GPUVAddr gpu_addr, u32 size, PixelFormat format) {
-    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
    TextureBufferBinding binding;
    if (!cpu_addr || size == 0) {
        binding.cpu_addr = 0;
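Two reading aids for the hunks above. First, the PAGE_BITS/PAGE_SIZE to YUZU_PAGEBITS/YUZU_PAGESIZE (and Core::Memory::PAGE_MASK to YUZU_PAGEMASK) changes are mechanical renames with no behavioral effect, presumably to dodge the PAGE_SIZE macro that some platform headers define; likewise, the maxwell3d. / kepler_compute. / gpu_memory. to -> changes follow from those engines becoming per-channel pointers supplied by the new ChannelSetupCaches base class rather than constructor-injected references. Second, the loops they touch all share one shape: a flat table maps 64 KiB pages of the 39-bit guest address space to buffer IDs, and a range walk advances one page at a time over unmapped pages but leaps past an entire buffer once one is hit. A self-contained model of that walk; SimpleBuffer and the vector-backed table are illustrative stand-ins, not yuzu's SlotVector/std::array types:

#include <cstdint>
#include <vector>

using u32 = std::uint32_t;
using u64 = std::uint64_t;
using VAddr = u64;

constexpr u32 YUZU_PAGEBITS = 16; // 64 KiB cache pages, as in the diff
constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS;

constexpr u64 DivCeil(u64 n, u64 d) { // stand-in for Common::DivCeil
    return (n + d - 1) / d;
}

struct SimpleBuffer { // illustrative stand-in for the cache's Buffer type
    VAddr cpu_addr;
    u64 size_bytes;
};

// Mirrors the loop shape of ForEachBufferInRange: unmapped pages advance by
// one; a mapped page invokes the functor, then jumps past the whole buffer.
template <typename Func>
void ForEachBufferInRange(const std::vector<u32>& page_table,
                          const std::vector<SimpleBuffer>& buffers, VAddr cpu_addr, u64 size,
                          Func&& func) {
    const u64 page_end = DivCeil(cpu_addr + size, YUZU_PAGESIZE);
    for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) {
        const u32 buffer_id = page_table[page];
        if (buffer_id == 0) { // slot 0 is reserved for the null buffer: nothing mapped here
            ++page;
            continue;
        }
        const SimpleBuffer& buffer = buffers[buffer_id];
        func(buffer_id, buffer);
        const VAddr end_addr = buffer.cpu_addr + buffer.size_bytes;
        page = DivCeil(end_addr, YUZU_PAGESIZE); // leap to the first page past this buffer
    }
}

IsRegionGpuModified, IsRegionRegistered, and IsRegionCpuModified in the diff are early-returning variants of this same walk, which is why the rename appears in so many hunks.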