From be1a3f7a0fb330b7cc5ac007ccb2cb73d4795602 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 10 Jul 2021 18:19:10 +0200 Subject: accelerateDMA: Accelerate Buffer Copies. --- src/video_core/buffer_cache/buffer_cache.h | 81 +++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'src/video_core/buffer_cache/buffer_cache.h') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 502feddba..c73ebb1f4 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -164,6 +164,8 @@ public: /// Pop asynchronous downloads void PopAsyncFlushes(); + [[nodiscard]] bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -430,6 +432,83 @@ void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { }); } +template +bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { + const std::optional cpu_src_address = gpu_memory.GpuToCpuAddress(src_address); + const std::optional cpu_dest_address = gpu_memory.GpuToCpuAddress(dest_address); + if (!cpu_src_address || !cpu_dest_address) { + return false; + } + const bool source_dirty = IsRegionGpuModified(*cpu_src_address, amount); + const bool dest_dirty = IsRegionGpuModified(*cpu_dest_address, amount); + if (!(source_dirty || dest_dirty)) { + return false; + } + + const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + common_ranges.subtract(subtract_interval); + + BufferId buffer_a; + BufferId buffer_b; + do { + has_deleted_buffers = false; + buffer_a = FindBuffer(*cpu_src_address, static_cast(amount)); + buffer_b = FindBuffer(*cpu_dest_address, static_cast(amount)); + } while (has_deleted_buffers); + auto& src_buffer = slot_buffers[buffer_a]; + auto& dest_buffer = slot_buffers[buffer_b]; + SynchronizeBuffer(src_buffer, *cpu_src_address, amount); + SynchronizeBuffer(dest_buffer, *cpu_dest_address, amount); + std::array copies{BufferCopy{ + .src_offset = src_buffer.Offset(*cpu_src_address), + .dst_offset = dest_buffer.Offset(*cpu_dest_address), + .size = amount, + }}; + + auto mirror = [&](VAddr base_address, u64 size) { + VAddr diff = base_address - *cpu_src_address; + VAddr new_base_address = *cpu_dest_address + diff; + const IntervalType add_interval{new_base_address, new_base_address + size}; + common_ranges.add(add_interval); + }; + + const VAddr start_address = *cpu_src_address; + const VAddr end_address = start_address + amount; + const IntervalType search_interval{start_address - amount, 1}; + auto it = common_ranges.lower_bound(search_interval); + if (it == common_ranges.end()) { + it = common_ranges.begin(); + } + while (it != common_ranges.end()) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr >= end_address) { + break; + } + if (inter_addr_end <= start_address) { + it++; + continue; + } + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + mirror(inter_addr, inter_addr_end - inter_addr); + it++; + } + + runtime.CopyBuffer(dest_buffer, src_buffer, copies); + if (source_dirty) { + dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + } + std::vector tmp_buffer(amount); + cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); + cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); + return true; +} + template void BufferCache

::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) { @@ -951,7 +1030,7 @@ void BufferCache

::UpdateIndexBuffer() { const GPUVAddr gpu_addr_end = index_array.EndAddress(); const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); + const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; -- cgit v1.2.3 From bc19d289637ca4d1bd6c14e56953ce74ffba08c5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 12 Jul 2021 04:10:42 +0200 Subject: accelerateDMA: Fixes and feedback. --- src/video_core/buffer_cache/buffer_cache.h | 137 ++++++++++++----------------- 1 file changed, 54 insertions(+), 83 deletions(-) (limited to 'src/video_core/buffer_cache/buffer_cache.h') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index c73ebb1f4..2871682f6 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -202,6 +202,36 @@ private: } } + template + void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const VAddr search_base = + static_cast(std::min(0LL, static_cast(start_address - size))); + const IntervalType search_interval{search_base, search_base + 1}; + auto it = common_ranges.lower_bound(search_interval); + if (it == common_ranges.end()) { + it = common_ranges.begin(); + } + for (; it != common_ranges.end(); it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr >= end_address) { + break; + } + if (inter_addr_end <= start_address) { + continue; + } + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { return (cpu_addr & ~Core::Memory::PAGE_MASK) == ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); @@ -441,12 +471,15 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } const bool source_dirty = IsRegionGpuModified(*cpu_src_address, amount); const bool dest_dirty = IsRegionGpuModified(*cpu_dest_address, amount); - if (!(source_dirty || dest_dirty)) { + if (!source_dirty && !dest_dirty) { return false; } const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; - common_ranges.subtract(subtract_interval); + uncommitted_ranges.subtract(subtract_interval); + for (auto& interval_set : committed_ranges) { + interval_set.subtract(subtract_interval); + } BufferId buffer_a; BufferId buffer_b; @@ -457,46 +490,28 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } while (has_deleted_buffers); auto& src_buffer = slot_buffers[buffer_a]; auto& dest_buffer = slot_buffers[buffer_b]; - SynchronizeBuffer(src_buffer, *cpu_src_address, amount); - SynchronizeBuffer(dest_buffer, *cpu_dest_address, amount); + SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast(amount)); + SynchronizeBuffer(dest_buffer, *cpu_dest_address, static_cast(amount)); std::array copies{BufferCopy{ .src_offset = src_buffer.Offset(*cpu_src_address), .dst_offset = dest_buffer.Offset(*cpu_dest_address), .size = amount, }}; - auto mirror = [&](VAddr base_address, u64 size) { - VAddr diff = base_address - *cpu_src_address; - VAddr new_base_address = *cpu_dest_address + diff; + boost::container::small_vector tmp_intervals; + auto mirror = [&](VAddr base_address, VAddr base_address_end) { + const u64 size = base_address_end - base_address; + const VAddr diff = base_address - *cpu_src_address; + const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - common_ranges.add(add_interval); + uncommitted_ranges.add(add_interval); + tmp_intervals.push_back(add_interval); }; - - const VAddr start_address = *cpu_src_address; - const VAddr end_address = start_address + amount; - const IntervalType search_interval{start_address - amount, 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - while (it != common_ranges.end()) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - it++; - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - mirror(inter_addr, inter_addr_end - inter_addr); - it++; + ForEachWrittenRange(*cpu_src_address, amount, mirror); + // This subtraction in this order is important for overlapping copies. + common_ranges.subtract(subtract_interval); + for (const IntervalType add_interval : tmp_intervals) { + common_ranges.add(add_interval); } runtime.CopyBuffer(dest_buffer, src_buffer, copies); @@ -695,30 +710,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { const VAddr start_address = buffer_addr + range_offset; const VAddr end_address = start_address + range_size; - const IntervalType search_interval{cpu_addr, 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - while (it != common_ranges.end()) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - it++; - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - add_download(inter_addr, inter_addr_end); - it++; - } + ForEachWrittenRange(start_address, range_size, add_download); const IntervalType subtract_interval{start_address, end_address}; common_ranges.subtract(subtract_interval); }); @@ -816,7 +808,9 @@ void BufferCache

::BindHostIndexBuffer() { const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { - runtime.BindIndexBuffer(buffer, offset, size); + const u32 new_offset = offset + maxwell3d.regs.index_array.first * + maxwell3d.regs.index_array.FormatSizeInBytes(); + runtime.BindIndexBuffer(buffer, new_offset, size); } else { runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, @@ -1429,30 +1423,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si const VAddr start_address = buffer_addr + range_offset; const VAddr end_address = start_address + range_size; - const IntervalType search_interval{start_address - range_size, 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - while (it != common_ranges.end()) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - it++; - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - add_download(inter_addr, inter_addr_end); - it++; - } + ForEachWrittenRange(start_address, range_size, add_download); const IntervalType subtract_interval{start_address, end_address}; common_ranges.subtract(subtract_interval); }); -- cgit v1.2.3