diff options
Diffstat (limited to 'src/video_core/texture_cache/texture_cache.h')
-rw-r--r-- | src/video_core/texture_cache/texture_cache.h | 691 |
1 files changed, 534 insertions, 157 deletions
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 1b01990a4..d3f03a995 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: 2021 yuzu Emulator Project +// SPDX-FileCopyrightText: 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include <unordered_set> +#include <boost/container/small_vector.hpp> #include "common/alignment.h" #include "common/settings.h" @@ -17,15 +18,10 @@ namespace VideoCommon { -using Tegra::Texture::SwizzleSource; -using Tegra::Texture::TextureType; using Tegra::Texture::TICEntry; using Tegra::Texture::TSCEntry; using VideoCore::Surface::GetFormatType; -using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; using namespace Common::Literals; @@ -53,8 +49,8 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& if constexpr (HAS_DEVICE_MEMORY_INFO) { const s64 device_memory = static_cast<s64>(runtime.GetDeviceLocalMemory()); - const s64 min_spacing_expected = device_memory - 1_GiB - 512_MiB; - const s64 min_spacing_critical = device_memory - 1_GiB; + const s64 min_spacing_expected = device_memory - 1_GiB; + const s64 min_spacing_critical = device_memory - 512_MiB; const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD); const s64 min_vacancy_expected = (6 * mem_threshold) / 10; const s64 min_vacancy_critical = (3 * mem_threshold) / 10; @@ -85,10 +81,17 @@ void TextureCache<P>::RunGarbageCollector() { } --num_iterations; auto& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::IsDecoding)) { + // This image is still being decoded, deleting it will invalidate the slot + // used by the async decoder thread. + return false; + } + if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) { + return false; + } const bool must_download = image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap); - if (!high_priority_mode && - (must_download || True(image.flags & ImageFlagBits::CostlyLoad))) { + if (!high_priority_mode && must_download) { return false; } if (must_download) { @@ -133,9 +136,17 @@ void TextureCache<P>::TickFrame() { sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); + TickAsyncDecode(); + runtime.TickFrame(); - critical_gc = 0; ++frame_tick; + + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); + } + async_buffers_death_ring.clear(); + } } template <class P> @@ -174,31 +185,91 @@ void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) { } template <class P> +void TextureCache<P>::CheckFeedbackLoop(std::span<const ImageViewInOut> views) { + if (!Settings::values.barrier_feedback_loops.GetValue()) { + return; + } + + const bool requires_barrier = [&] { + for (const auto& view : views) { + if (!view.id) { + continue; + } + auto& image_view = slot_image_views[view.id]; + + // Check color targets + for (const auto& ct_view_id : render_targets.color_buffer_ids) { + if (ct_view_id) { + auto& ct_view = slot_image_views[ct_view_id]; + if (image_view.image_id == ct_view.image_id) { + return true; + } + } + } + + // Check zeta target + if (render_targets.depth_buffer_id) { + auto& zt_view = slot_image_views[render_targets.depth_buffer_id]; + if (image_view.image_id == zt_view.image_id) { + return true; + } + } + } + + return false; + }(); + + if (requires_barrier) { + runtime.BarrierFeedbackLoop(); + } +} + +template <class P> typename P::Sampler* TextureCache<P>::GetGraphicsSampler(u32 index) { + return &slot_samplers[GetGraphicsSamplerId(index)]; +} + +template <class P> +typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) { + return &slot_samplers[GetComputeSamplerId(index)]; +} + +template <class P> +SamplerId TextureCache<P>::GetGraphicsSamplerId(u32 index) { if (index > channel_state->graphics_sampler_table.Limit()) { LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index); - return &slot_samplers[NULL_SAMPLER_ID]; + return NULL_SAMPLER_ID; } const auto [descriptor, is_new] = channel_state->graphics_sampler_table.Read(index); SamplerId& id = channel_state->graphics_sampler_ids[index]; if (is_new) { id = FindSampler(descriptor); } - return &slot_samplers[id]; + return id; } template <class P> -typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) { +SamplerId TextureCache<P>::GetComputeSamplerId(u32 index) { if (index > channel_state->compute_sampler_table.Limit()) { LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index); - return &slot_samplers[NULL_SAMPLER_ID]; + return NULL_SAMPLER_ID; } const auto [descriptor, is_new] = channel_state->compute_sampler_table.Read(index); SamplerId& id = channel_state->compute_sampler_ids[index]; if (is_new) { id = FindSampler(descriptor); } - return &slot_samplers[id]; + return id; +} + +template <class P> +const typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) const noexcept { + return slot_samplers[id]; +} + +template <class P> +typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) noexcept { + return slot_samplers[id]; } template <class P> @@ -233,7 +304,7 @@ void TextureCache<P>::SynchronizeComputeDescriptors() { } template <class P> -bool TextureCache<P>::RescaleRenderTargets(bool is_clear) { +bool TextureCache<P>::RescaleRenderTargets() { auto& flags = maxwell3d->dirty.flags; u32 scale_rating = 0; bool rescaled = false; @@ -271,13 +342,13 @@ bool TextureCache<P>::RescaleRenderTargets(bool is_clear) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; if (flags[Dirty::ColorBuffer0 + index] || force) { flags[Dirty::ColorBuffer0 + index] = false; - BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + BindRenderTarget(&color_buffer_id, FindColorBuffer(index)); } check_rescale(color_buffer_id, tmp_color_images[index]); } if (flags[Dirty::ZetaBuffer] || force) { flags[Dirty::ZetaBuffer] = false; - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer()); } check_rescale(render_targets.depth_buffer_id, tmp_depth_image); @@ -342,7 +413,7 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) { return; } - const bool rescaled = RescaleRenderTargets(is_clear); + const bool rescaled = RescaleRenderTargets(); if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; @@ -455,7 +526,7 @@ void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) { template <class P> void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { - std::vector<ImageId> images; + boost::container::small_vector<ImageId, 16> images; ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) { if (!image.IsSafeDownload()) { return; @@ -481,8 +552,34 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { } template <class P> +std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(VAddr cpu_addr, + u64 size) { + std::optional<VideoCore::RasterizerDownloadArea> area{}; + ForEachImageInRegion(cpu_addr, size, [&](ImageId, ImageBase& image) { + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; + } + if (!area) { + area.emplace(); + area->start_address = cpu_addr; + area->end_address = cpu_addr + size; + area->preemtive = true; + } + area->start_address = std::min(area->start_address, image.cpu_addr); + area->end_address = std::max(area->end_address, image.cpu_addr_end); + for (auto image_view_id : image.image_view_ids) { + auto& image_view = slot_image_views[image_view_id]; + image_view.flags |= ImageViewFlagBits::PreemtiveDownload; + } + area->preemtive &= image.info.forced_flushed; + image.info.forced_flushed = true; + }); + return area; +} + +template <class P> void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { - std::vector<ImageId> deleted_images; + boost::container::small_vector<ImageId, 16> deleted_images; ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; @@ -496,7 +593,7 @@ void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { template <class P> void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) { - std::vector<ImageId> deleted_images; + boost::container::small_vector<ImageId, 16> deleted_images; ForEachImageInRegionGPU(as_id, gpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { @@ -654,25 +751,41 @@ template <class P> void TextureCache<P>::CommitAsyncFlushes() { // This is intentionally passing the value by copy if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - const std::span<const ImageId> download_ids = uncommitted_downloads; + auto& download_ids = uncommitted_downloads; if (download_ids.empty()) { committed_downloads.emplace_back(std::move(uncommitted_downloads)); uncommitted_downloads.clear(); - async_buffers.emplace_back(std::optional<AsyncBuffer>{}); + async_buffers.emplace_back(std::move(uncommitted_async_buffers)); + uncommitted_async_buffers.clear(); return; } size_t total_size_bytes = 0; - for (const ImageId image_id : download_ids) { - total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + size_t last_async_buffer_id = uncommitted_async_buffers.size(); + bool any_none_dma = false; + for (PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + total_size_bytes += + Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64); + any_none_dma = true; + download_info.async_buffer_id = last_async_buffer_id; + } } - auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); - for (const ImageId image_id : download_ids) { - Image& image = slot_images[image_id]; - const auto copies = FullDownloadCopies(image.info); - image.DownloadMemory(download_map, copies); - download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); + + if (any_none_dma) { + auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); + for (const PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + Image& image = slot_images[download_info.object_id]; + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(download_map, copies); + download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); + } + } + uncommitted_async_buffers.emplace_back(download_map); } - async_buffers.emplace_back(download_map); + + async_buffers.emplace_back(std::move(uncommitted_async_buffers)); + uncommitted_async_buffers.clear(); } committed_downloads.emplace_back(std::move(uncommitted_downloads)); uncommitted_downloads.clear(); @@ -684,39 +797,57 @@ void TextureCache<P>::PopAsyncFlushes() { return; } if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - const std::span<const ImageId> download_ids = committed_downloads.front(); + const auto& download_ids = committed_downloads.front(); if (download_ids.empty()) { committed_downloads.pop_front(); async_buffers.pop_front(); return; } - auto download_map = *async_buffers.front(); - std::span<u8> download_span = download_map.mapped_span; + auto download_map = std::move(async_buffers.front()); for (size_t i = download_ids.size(); i > 0; i--) { - const ImageBase& image = slot_images[download_ids[i - 1]]; - const auto copies = FullDownloadCopies(image.info); - download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); - std::span<u8> download_span_alt = download_span.subspan(download_map.offset); - SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt, - swizzle_data_buffer); + auto& download_info = download_ids[i - 1]; + auto& download_buffer = download_map[download_info.async_buffer_id]; + if (download_info.is_swizzle) { + const ImageBase& image = slot_images[download_info.object_id]; + const auto copies = FullDownloadCopies(image.info); + download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); + std::span<u8> download_span = + download_buffer.mapped_span.subspan(download_buffer.offset); + SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, + swizzle_data_buffer); + } else { + const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id]; + std::span<u8> download_span = + download_buffer.mapped_span.subspan(download_buffer.offset); + gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(), + buffer_info.size); + slot_buffer_downloads.erase(download_info.object_id); + } + } + for (auto& download_buffer : download_map) { + async_buffers_death_ring.emplace_back(download_buffer); } - runtime.FreeDeferredStagingBuffer(download_map); committed_downloads.pop_front(); async_buffers.pop_front(); } else { - const std::span<const ImageId> download_ids = committed_downloads.front(); + const auto& download_ids = committed_downloads.front(); if (download_ids.empty()) { committed_downloads.pop_front(); return; } size_t total_size_bytes = 0; - for (const ImageId image_id : download_ids) { - total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + for (const PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes; + } } auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); const size_t original_offset = download_map.offset; - for (const ImageId image_id : download_ids) { - Image& image = slot_images[image_id]; + for (const PendingDownload& download_info : download_ids) { + if (!download_info.is_swizzle) { + continue; + } + Image& image = slot_images[download_info.object_id]; const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(download_map, copies); download_map.offset += image.unswizzled_size_bytes; @@ -725,8 +856,11 @@ void TextureCache<P>::PopAsyncFlushes() { runtime.Finish(); download_map.offset = original_offset; std::span<u8> download_span = download_map.mapped_span; - for (const ImageId image_id : download_ids) { - const ImageBase& image = slot_images[image_id]; + for (const PendingDownload& download_info : download_ids) { + if (!download_info.is_swizzle) { + continue; + } + const ImageBase& image = slot_images[download_info.object_id]; const auto copies = FullDownloadCopies(image.info); SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, swizzle_data_buffer); @@ -738,6 +872,26 @@ void TextureCache<P>::PopAsyncFlushes() { } template <class P> +ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, bool is_upload) { + const ImageInfo dst_info(operand); + const ImageId image_id = FindDMAImage(dst_info, operand.address); + if (!image_id) { + return NULL_IMAGE_ID; + } + auto& image = slot_images[image_id]; + if (!is_upload && !image.info.dma_downloaded) { + // Force a full sync. + image.info.dma_downloaded = true; + return NULL_IMAGE_ID; + } + const auto base = image.TryFindBase(operand.address); + if (!base) { + return NULL_IMAGE_ID; + } + return image_id; +} + +template <class P> bool TextureCache<P>::IsRescaling() const noexcept { return is_rescaling; } @@ -765,6 +919,76 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { } template <class P> +std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy( + const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) { + const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image); + auto* image = &slot_images[image_id]; + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format); + const auto convert = [old_bpp = image_operand.bytes_per_pixel, bpp](u32 value) { + return (old_bpp * value) / bpp; + }; + const u32 base_x = convert(image_operand.params.origin.x.Value()); + const u32 base_y = image_operand.params.origin.y.Value(); + const u32 length_x = convert(copy_info.length_x); + const u32 length_y = copy_info.length_y; + + const BufferImageCopy copy{ + .buffer_offset = 0, + .buffer_size = buffer_size, + .buffer_row_length = convert(buffer_operand.pitch), + .buffer_image_height = buffer_operand.height, + .image_subresource = + { + .base_level = static_cast<s32>(level), + .base_layer = static_cast<s32>(base), + .num_layers = 1, + }, + .image_offset = + { + .x = static_cast<s32>(base_x), + .y = static_cast<s32>(base_y), + .z = 0, + }, + .image_extent = + { + .width = length_x, + .height = length_y, + .depth = 1, + }, + }; + return {image, copy}; +} + +template <class P> +void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* image, + typename TextureCache<P>::BufferType buffer, + size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies, + GPUVAddr address, size_t size) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + const BufferDownload new_buffer_download{address, size}; + auto slot = slot_buffer_downloads.insert(new_buffer_download); + const PendingDownload new_download{false, uncommitted_async_buffers.size(), slot}; + uncommitted_downloads.emplace_back(new_download); + auto download_map = runtime.DownloadStagingBuffer(size, true); + uncommitted_async_buffers.emplace_back(download_map); + std::array buffers{ + buffer, + download_map.buffer, + }; + std::array<u64, 2> buffer_offsets{ + buffer_offset, + download_map.offset, + }; + image->DownloadMemory(buffers, buffer_offsets, copies); + } else { + image->DownloadMemory(buffer, buffer_offset, copies); + } +} + +template <class P> void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) { if (False(image.flags & ImageFlagBits::CpuModified)) { // Only upload modified images @@ -773,10 +997,14 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) { image.flags &= ~ImageFlagBits::CpuModified; TrackImage(image, image_id); - if (image.info.num_samples > 1) { + if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); return; } + if (True(image.flags & ImageFlagBits::AsynchronousDecode)) { + QueueAsyncDecode(image, image_id); + return; + } auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); UploadImageContents(image, staging); runtime.InsertUploadMemoryBarrier(); @@ -873,7 +1101,7 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, const bool native_bgr = runtime.HasNativeBgr(); const bool flexible_formats = True(options & RelaxedOptions::Format); ImageId image_id{}; - boost::container::small_vector<ImageId, 1> image_ids; + boost::container::small_vector<ImageId, 8> image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -990,6 +1218,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) { } template <class P> +void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) { + UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted)); + LOG_INFO(HW_GPU, "Queuing async texture decode"); + + image.flags |= ImageFlagBits::IsDecoding; + auto decode = std::make_unique<AsyncDecodeContext>(); + auto* decode_ptr = decode.get(); + decode->image_id = image_id; + async_decodes.push_back(std::move(decode)); + + Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes); + const size_t guest_size_bytes = image.guest_size_bytes; + swizzle_data_buffer.resize_destructive(guest_size_bytes); + gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes); + auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer, + local_unswizzle_data_buffer); + const size_t out_size = MapSizeBytes(image); + + auto func = [out_size, copies, info = image.info, + input = std::move(local_unswizzle_data_buffer), + async_decode = decode_ptr]() mutable { + async_decode->decoded_data.resize_destructive(out_size); + std::span copies_span{copies.data(), copies.size()}; + ConvertImage(input, info, async_decode->decoded_data, copies_span); + + // TODO: Do we need this lock? + std::unique_lock lock{async_decode->mutex}; + async_decode->copies = std::move(copies); + async_decode->complete = true; + }; + texture_decode_worker.QueueWork(std::move(func)); +} + +template <class P> +void TextureCache<P>::TickAsyncDecode() { + bool has_uploads{}; + auto i = async_decodes.begin(); + while (i != async_decodes.end()) { + auto* async_decode = i->get(); + std::unique_lock lock{async_decode->mutex}; + if (!async_decode->complete) { + ++i; + continue; + } + Image& image = slot_images[async_decode->image_id]; + auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); + std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(), + async_decode->decoded_data.size()); + image.UploadMemory(staging, async_decode->copies); + image.flags &= ~ImageFlagBits::IsDecoding; + has_uploads = true; + i = async_decodes.erase(i); + } + if (has_uploads) { + runtime.InsertUploadMemoryBarrier(); + } +} + +template <class P> bool TextureCache<P>::ScaleUp(Image& image) { const bool has_copy = image.HasScaled(); const bool rescaled = image.ScaleUp(); @@ -1044,17 +1331,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA const size_t size_bytes = CalculateGuestSizeInBytes(new_info); const bool broken_views = runtime.HasBrokenTextureViewFormats(); const bool native_bgr = runtime.HasNativeBgr(); - std::vector<ImageId> overlap_ids; - std::unordered_set<ImageId> overlaps_found; - std::vector<ImageId> left_aliased_ids; - std::vector<ImageId> right_aliased_ids; - std::unordered_set<ImageId> ignore_textures; - std::vector<ImageId> bad_overlap_ids; - std::vector<ImageId> all_siblings; + join_overlap_ids.clear(); + join_overlaps_found.clear(); + join_left_aliased_ids.clear(); + join_right_aliased_ids.clear(); + join_ignore_textures.clear(); + join_bad_overlap_ids.clear(); + join_copies_to_do.clear(); + join_alias_indices.clear(); const bool this_is_linear = info.type == ImageType::Linear; const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) { if (True(overlap.flags & ImageFlagBits::Remapped)) { - ignore_textures.insert(overlap_id); + join_ignore_textures.insert(overlap_id); return; } const bool overlap_is_linear = overlap.info.type == ImageType::Linear; @@ -1064,11 +1352,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA if (this_is_linear && overlap_is_linear) { if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { // Alias linear images with the same pitch - left_aliased_ids.push_back(overlap_id); + join_left_aliased_ids.push_back(overlap_id); } return; } - overlaps_found.insert(overlap_id); + join_overlaps_found.insert(overlap_id); static constexpr bool strict_size = true; const std::optional<OverlapResult> solution = ResolveOverlap( new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views, native_bgr); @@ -1076,34 +1364,33 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA gpu_addr = solution->gpu_addr; cpu_addr = solution->cpu_addr; new_info.resources = solution->resources; - overlap_ids.push_back(overlap_id); - all_siblings.push_back(overlap_id); + join_overlap_ids.push_back(overlap_id); + join_copies_to_do.emplace_back(JoinCopy{false, overlap_id}); return; } static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format; const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { - left_aliased_ids.push_back(overlap_id); + join_left_aliased_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::Alias; - all_siblings.push_back(overlap_id); + join_copies_to_do.emplace_back(JoinCopy{true, overlap_id}); } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, broken_views, native_bgr)) { - right_aliased_ids.push_back(overlap_id); + join_right_aliased_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::Alias; - all_siblings.push_back(overlap_id); + join_copies_to_do.emplace_back(JoinCopy{true, overlap_id}); } else { - bad_overlap_ids.push_back(overlap_id); - overlap.flags |= ImageFlagBits::BadOverlap; + join_bad_overlap_ids.push_back(overlap_id); } }; ForEachImageInRegion(cpu_addr, size_bytes, region_check); const auto region_check_gpu = [&](ImageId overlap_id, ImageBase& overlap) { - if (!overlaps_found.contains(overlap_id)) { + if (!join_overlaps_found.contains(overlap_id)) { if (True(overlap.flags & ImageFlagBits::Remapped)) { - ignore_textures.insert(overlap_id); + join_ignore_textures.insert(overlap_id); } if (overlap.gpu_addr == gpu_addr && overlap.guest_size_bytes == size_bytes) { - ignore_textures.insert(overlap_id); + join_ignore_textures.insert(overlap_id); } } }; @@ -1111,11 +1398,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA bool can_rescale = info.rescaleable; bool any_rescaled = false; - for (const ImageId sibling_id : all_siblings) { + for (const auto& copy : join_copies_to_do) { if (!can_rescale) { break; } - Image& sibling = slot_images[sibling_id]; + Image& sibling = slot_images[copy.id]; can_rescale &= ImageCanRescale(sibling); any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled); } @@ -1123,13 +1410,13 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA can_rescale &= any_rescaled; if (can_rescale) { - for (const ImageId sibling_id : all_siblings) { - Image& sibling = slot_images[sibling_id]; + for (const auto& copy : join_copies_to_do) { + Image& sibling = slot_images[copy.id]; ScaleUp(sibling); } } else { - for (const ImageId sibling_id : all_siblings) { - Image& sibling = slot_images[sibling_id]; + for (const auto& copy : join_copies_to_do) { + Image& sibling = slot_images[copy.id]; ScaleDown(sibling); } } @@ -1137,11 +1424,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; - if (!gpu_memory->IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) { + if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes)) { new_image.flags |= ImageFlagBits::Sparse; } - for (const ImageId overlap_id : ignore_textures) { + for (const ImageId overlap_id : join_ignore_textures) { Image& overlap = slot_images[overlap_id]; if (True(overlap.flags & ImageFlagBits::GpuModified)) { UNIMPLEMENTED(); @@ -1162,44 +1449,81 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA ScaleDown(new_image); } - for (const ImageId overlap_id : overlap_ids) { - Image& overlap = slot_images[overlap_id]; - if (True(overlap.flags & ImageFlagBits::GpuModified)) { - new_image.flags |= ImageFlagBits::GpuModified; - } - if (overlap.info.num_samples != new_image.info.num_samples) { - LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); - } else { - const auto& resolution = Settings::values.resolution_info; - const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); - const u32 up_scale = can_rescale ? resolution.up_scale : 1; - const u32 down_shift = can_rescale ? resolution.down_shift : 0; - auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift); - runtime.CopyImage(new_image, overlap, std::move(copies)); - } - if (True(overlap.flags & ImageFlagBits::Tracked)) { - UntrackImage(overlap, overlap_id); - } - UnregisterImage(overlap_id); - DeleteImage(overlap_id); - } + std::ranges::sort(join_copies_to_do, [this](const JoinCopy& lhs, const JoinCopy& rhs) { + const ImageBase& lhs_image = slot_images[lhs.id]; + const ImageBase& rhs_image = slot_images[rhs.id]; + return lhs_image.modification_tick < rhs_image.modification_tick; + }); + ImageBase& new_image_base = new_image; - for (const ImageId aliased_id : right_aliased_ids) { + for (const ImageId aliased_id : join_right_aliased_ids) { ImageBase& aliased = slot_images[aliased_id]; - AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); + size_t alias_index = new_image_base.aliased_images.size(); + if (!AddImageAlias(new_image_base, aliased, new_image_id, aliased_id)) { + continue; + } + join_alias_indices.emplace(aliased_id, alias_index); new_image.flags |= ImageFlagBits::Alias; } - for (const ImageId aliased_id : left_aliased_ids) { + for (const ImageId aliased_id : join_left_aliased_ids) { ImageBase& aliased = slot_images[aliased_id]; - AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); + size_t alias_index = new_image_base.aliased_images.size(); + if (!AddImageAlias(aliased, new_image_base, aliased_id, new_image_id)) { + continue; + } + join_alias_indices.emplace(aliased_id, alias_index); new_image.flags |= ImageFlagBits::Alias; } - for (const ImageId aliased_id : bad_overlap_ids) { + for (const ImageId aliased_id : join_bad_overlap_ids) { ImageBase& aliased = slot_images[aliased_id]; aliased.overlapping_images.push_back(new_image_id); new_image.overlapping_images.push_back(aliased_id); - new_image.flags |= ImageFlagBits::BadOverlap; + if (aliased.info.resources.levels == 1 && aliased.info.block.depth == 0 && + aliased.overlapping_images.size() > 1) { + aliased.flags |= ImageFlagBits::BadOverlap; + } + if (new_image.info.resources.levels == 1 && new_image.info.block.depth == 0 && + new_image.overlapping_images.size() > 1) { + new_image.flags |= ImageFlagBits::BadOverlap; + } + } + + for (const auto& copy_object : join_copies_to_do) { + Image& overlap = slot_images[copy_object.id]; + if (copy_object.is_alias) { + if (!overlap.IsSafeDownload()) { + continue; + } + const auto alias_pointer = join_alias_indices.find(copy_object.id); + if (alias_pointer == join_alias_indices.end()) { + continue; + } + const AliasedImage& aliased = new_image.aliased_images[alias_pointer->second]; + CopyImage(new_image_id, aliased.id, aliased.copies); + new_image.modification_tick = overlap.modification_tick; + continue; + } + if (True(overlap.flags & ImageFlagBits::GpuModified)) { + new_image.flags |= ImageFlagBits::GpuModified; + const auto& resolution = Settings::values.resolution_info; + const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); + const u32 up_scale = can_rescale ? resolution.up_scale : 1; + const u32 down_shift = can_rescale ? resolution.down_shift : 0; + auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift); + if (overlap.info.num_samples != new_image.info.num_samples) { + runtime.CopyImageMSAA(new_image, overlap, std::move(copies)); + } else { + runtime.CopyImage(new_image, overlap, std::move(copies)); + } + new_image.modification_tick = overlap.modification_tick; + } + if (True(overlap.flags & ImageFlagBits::Tracked)) { + UntrackImage(overlap, copy_object.id); + } + UnregisterImage(copy_object.id); + DeleteImage(copy_object.id); } + RegisterImage(new_image_id); return new_image_id; } @@ -1289,6 +1613,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag } template <class P> +ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) { + std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info)); + if (!cpu_addr) { + return ImageId{}; + } + } + ImageId image_id{}; + boost::container::small_vector<ImageId, 8> image_ids; + const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { + if (True(existing_image.flags & ImageFlagBits::Remapped)) { + return false; + } + if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) + [[unlikely]] { + const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong); + const ImageInfo& existing = existing_image.info; + if (existing_image.gpu_addr == gpu_addr && existing.type == info.type && + existing.pitch == info.pitch && + IsPitchLinearSameSize(existing, info, strict_size) && + IsViewCompatible(existing.format, info.format, false, true)) { + image_id = existing_image_id; + image_ids.push_back(existing_image_id); + return true; + } + } else if (IsSubCopy(info, existing_image, gpu_addr)) { + image_id = existing_image_id; + image_ids.push_back(existing_image_id); + return true; + } + return false; + }; + ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda); + if (image_ids.size() <= 1) [[likely]] { + return image_id; + } + auto image_ids_compare = [this](ImageId a, ImageId b) { + auto& image_a = slot_images[a]; + auto& image_b = slot_images[b]; + return image_a.modification_tick < image_b.modification_tick; + }; + return *std::ranges::max_element(image_ids, image_ids_compare); +} + +template <class P> +std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr, + bool mark_as_modified) { + const auto& image = slot_images[dst_id]; + const auto base = image.TryFindBase(base_addr); + PrepareImage(dst_id, mark_as_modified, false); + const auto& new_image = slot_images[dst_id]; + lru_cache.Touch(new_image.lru_index, frame_tick); + return std::make_pair(base->level, base->layer); +} + +template <class P> SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) { return NULL_SAMPLER_ID; @@ -1301,7 +1682,7 @@ SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { } template <class P> -ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { +ImageViewId TextureCache<P>::FindColorBuffer(size_t index) { const auto& regs = maxwell3d->regs; if (index >= regs.rt_control.count) { return ImageViewId{}; @@ -1314,12 +1695,12 @@ ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { if (rt.format == Tegra::RenderTargetFormat::NONE) { return ImageViewId{}; } - const ImageInfo info(regs, index); - return FindRenderTargetView(info, gpu_addr, is_clear); + const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode); + return FindRenderTargetView(info, gpu_addr); } template <class P> -ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { +ImageViewId TextureCache<P>::FindDepthBuffer() { const auto& regs = maxwell3d->regs; if (!regs.zeta_enable) { return ImageViewId{}; @@ -1328,19 +1709,17 @@ ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { if (gpu_addr == 0) { return ImageViewId{}; } - const ImageInfo info(regs); - return FindRenderTargetView(info, gpu_addr, is_clear); + const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode); + return FindRenderTargetView(info, gpu_addr); } template <class P> -ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear) { - const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{}; +ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) { ImageId image_id{}; bool delete_state = has_deleted_images; do { has_deleted_images = false; - image_id = FindOrInsertImage(info, gpu_addr, options); + image_id = FindOrInsertImage(info, gpu_addr); delete_state |= has_deleted_images; } while (has_deleted_images); has_deleted_images = delete_state; @@ -1427,37 +1806,38 @@ void TextureCache<P>::ForEachImageInRegionGPU(size_t as_id, GPUVAddr gpu_addr, s return; } auto& gpu_page_table = gpu_page_table_storage[*storage_id]; - ForEachGPUPage(gpu_addr, size, [this, gpu_page_table, &images, gpu_addr, size, func](u64 page) { - const auto it = gpu_page_table.find(page); - if (it == gpu_page_table.end()) { - if constexpr (BOOL_BREAK) { - return false; - } else { - return; - } - } - for (const ImageId image_id : it->second) { - Image& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::Picked)) { - continue; - } - if (!image.OverlapsGPU(gpu_addr, size)) { - continue; - } - image.flags |= ImageFlagBits::Picked; - images.push_back(image_id); - if constexpr (BOOL_BREAK) { - if (func(image_id, image)) { - return true; - } - } else { - func(image_id, image); - } - } - if constexpr (BOOL_BREAK) { - return false; - } - }); + ForEachGPUPage(gpu_addr, size, + [this, &gpu_page_table, &images, gpu_addr, size, func](u64 page) { + const auto it = gpu_page_table.find(page); + if (it == gpu_page_table.end()) { + if constexpr (BOOL_BREAK) { + return false; + } else { + return; + } + } + for (const ImageId image_id : it->second) { + Image& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::Picked)) { + continue; + } + if (!image.OverlapsGPU(gpu_addr, size)) { + continue; + } + image.flags |= ImageFlagBits::Picked; + images.push_back(image_id); + if constexpr (BOOL_BREAK) { + if (func(image_id, image)) { + return true; + } + } else { + func(image_id, image); + } + } + if constexpr (BOOL_BREAK) { + return false; + } + }); for (const ImageId image_id : images) { slot_images[image_id].flags &= ~ImageFlagBits::Picked; } @@ -1549,10 +1929,6 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); } total_used_memory += Common::AlignUp(tentative_size, 1024); - if (total_used_memory > critical_memory && critical_gc < GC_EMERGENCY_COUNTS) { - RunGarbageCollector(); - critical_gc++; - } image.lru_index = lru_cache.Insert(image_id, frame_tick); ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { @@ -1566,7 +1942,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { image.map_view_id = map_id; return; } - std::vector<ImageViewId> sparse_maps{}; + boost::container::small_vector<ImageViewId, 16> sparse_maps; ForEachSparseSegment( image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id); @@ -1841,7 +2217,7 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept { template <class P> void TextureCache<P>::SynchronizeAliases(ImageId image_id) { - boost::container::small_vector<const AliasedImage*, 1> aliased_images; + boost::container::small_vector<const AliasedImage*, 8> aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); bool any_modified = True(image.flags & ImageFlagBits::GpuModified); @@ -2019,7 +2395,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) if (new_id) { const ImageViewBase& old_view = slot_image_views[new_id]; if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { - uncommitted_downloads.push_back(old_view.image_id); + const PendingDownload new_download{true, 0, old_view.image_id}; + uncommitted_downloads.emplace_back(new_download); } } *old_id = new_id; |