1 files changed, 534 insertions, 157 deletions
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 1b01990a4..d3f03a995 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -1,9 +1,10 @@
-// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
+// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 #pragma once
 
 #include <unordered_set>
+#include <boost/container/small_vector.hpp>
 
 #include "common/alignment.h"
 #include "common/settings.h"
@@ -17,15 +18,10 @@
 
 namespace VideoCommon {
 
-using Tegra::Texture::SwizzleSource;
-using Tegra::Texture::TextureType;
 using Tegra::Texture::TICEntry;
 using Tegra::Texture::TSCEntry;
 using VideoCore::Surface::GetFormatType;
-using VideoCore::Surface::IsCopyCompatible;
 using VideoCore::Surface::PixelFormat;
-using VideoCore::Surface::PixelFormatFromDepthFormat;
-using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
 using VideoCore::Surface::SurfaceType;
 using namespace Common::Literals;
 
@@ -53,8 +49,8 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
 
     if constexpr (HAS_DEVICE_MEMORY_INFO) {
         const s64 device_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
-        const s64 min_spacing_expected = device_memory - 1_GiB - 512_MiB;
-        const s64 min_spacing_critical = device_memory - 1_GiB;
+        const s64 min_spacing_expected = device_memory - 1_GiB;
+        const s64 min_spacing_critical = device_memory - 512_MiB;
         const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD);
         const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
         const s64 min_vacancy_critical = (3 * mem_threshold) / 10;
@@ -85,10 +81,17 @@ void TextureCache<P>::RunGarbageCollector() {
         }
         --num_iterations;
         auto& image = slot_images[image_id];
+        if (True(image.flags & ImageFlagBits::IsDecoding)) {
+            // This image is still being decoded, deleting it will invalidate the slot
+            // used by the async decoder thread.
+            return false;
+        }
+        if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
+            return false;
+        }
         const bool must_download =
             image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
-        if (!high_priority_mode &&
-            (must_download || True(image.flags & ImageFlagBits::CostlyLoad))) {
+        if (!high_priority_mode && must_download) {
             return false;
         }
         if (must_download) {
@@ -133,9 +136,17 @@ void TextureCache<P>::TickFrame() {
     sentenced_images.Tick();
     sentenced_framebuffers.Tick();
     sentenced_image_view.Tick();
+    TickAsyncDecode();
+
     runtime.TickFrame();
-    critical_gc = 0;
     ++frame_tick;
+
+    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        for (auto& buffer : async_buffers_death_ring) {
+            runtime.FreeDeferredStagingBuffer(buffer);
+        }
+        async_buffers_death_ring.clear();
+    }
 }
 
 template <class P>
@@ -174,31 +185,91 @@ void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) {
 }
 
 template <class P>
+void TextureCache<P>::CheckFeedbackLoop(std::span<const ImageViewInOut> views) {
+    if (!Settings::values.barrier_feedback_loops.GetValue()) {
+        return;
+    }
+
+    const bool requires_barrier = [&] {
+        for (const auto& view : views) {
+            if (!view.id) {
+                continue;
+            }
+            auto& image_view = slot_image_views[view.id];
+
+            // Check color targets
+            for (const auto& ct_view_id : render_targets.color_buffer_ids) {
+                if (ct_view_id) {
+                    auto& ct_view = slot_image_views[ct_view_id];
+                    if (image_view.image_id == ct_view.image_id) {
+                        return true;
+                    }
+                }
+            }
+
+            // Check zeta target
+            if (render_targets.depth_buffer_id) {
+                auto& zt_view = slot_image_views[render_targets.depth_buffer_id];
+                if (image_view.image_id == zt_view.image_id) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }();
+
+    if (requires_barrier) {
+        runtime.BarrierFeedbackLoop();
+    }
+}
+
+template <class P>
 typename P::Sampler* TextureCache<P>::GetGraphicsSampler(u32 index) {
+    return &GetSampler(GetGraphicsSamplerId(index));  // null sampler for out-of-range indices
+}
+
+template <class P>
+typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) {
+    return &GetSampler(GetComputeSamplerId(index));  // null sampler for out-of-range indices
+}
+
+template <class P>
+SamplerId TextureCache<P>::GetGraphicsSamplerId(u32 index) {
     if (index > channel_state->graphics_sampler_table.Limit()) {
         LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index);
-        return &slot_samplers[NULL_SAMPLER_ID];
+        return NULL_SAMPLER_ID;  // Index is past the table limit: report the null sampler
     }
     const auto [descriptor, is_new] = channel_state->graphics_sampler_table.Read(index);
     SamplerId& id = channel_state->graphics_sampler_ids[index];
     if (is_new) {
         id = FindSampler(descriptor);
     }
-    return &slot_samplers[id];
+    return id;  // Cached id for this index; re-resolved above when the descriptor is new
 }
 
 template <class P>
-typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) {
+SamplerId TextureCache<P>::GetComputeSamplerId(u32 index) {
     if (index > channel_state->compute_sampler_table.Limit()) {
         LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index);
-        return &slot_samplers[NULL_SAMPLER_ID];
+        return NULL_SAMPLER_ID;  // Index is past the table limit: report the null sampler
     }
     const auto [descriptor, is_new] = channel_state->compute_sampler_table.Read(index);
     SamplerId& id = channel_state->compute_sampler_ids[index];
     if (is_new) {
         id = FindSampler(descriptor);
     }
-    return &slot_samplers[id];
+    return id;  // Cached id for this index; re-resolved above when the descriptor is new
+}
+
+template <class P>
+const typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) const noexcept {
+    return slot_samplers[id];  // Direct slot lookup; the id is not validated here
+}
+
+template <class P>
+typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) noexcept {
+    return slot_samplers[id];  // Mutable access; the id is not validated here
 }
 
 template <class P>
@@ -233,7 +304,7 @@ void TextureCache<P>::SynchronizeComputeDescriptors() {
 }
 
 template <class P>
-bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
+bool TextureCache<P>::RescaleRenderTargets() {
     auto& flags = maxwell3d->dirty.flags;
     u32 scale_rating = 0;
     bool rescaled = false;
@@ -271,13 +342,13 @@ bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
             ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
             if (flags[Dirty::ColorBuffer0 + index] || force) {
                 flags[Dirty::ColorBuffer0 + index] = false;
-                BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear));
+                BindRenderTarget(&color_buffer_id, FindColorBuffer(index));
             }
             check_rescale(color_buffer_id, tmp_color_images[index]);
         }
         if (flags[Dirty::ZetaBuffer] || force) {
             flags[Dirty::ZetaBuffer] = false;
-            BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear));
+            BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer());
         }
         check_rescale(render_targets.depth_buffer_id, tmp_depth_image);
 
@@ -342,7 +413,7 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
         return;
     }
 
-    const bool rescaled = RescaleRenderTargets(is_clear);
+    const bool rescaled = RescaleRenderTargets();
     if (is_rescaling != rescaled) {
         flags[Dirty::RescaleViewports] = true;
         flags[Dirty::RescaleScissors] = true;
@@ -455,7 +526,7 @@ void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) {
 
 template <class P>
 void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
-    std::vector<ImageId> images;
+    boost::container::small_vector<ImageId, 16> images;
     ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) {
         if (!image.IsSafeDownload()) {
             return;
@@ -481,8 +552,34 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
 }
 
 template <class P>
+std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(VAddr cpu_addr,
+                                                                               u64 size) {
+    // Queried range grown over every overlapping GPU-modified image; empty when there is none
+    std::optional<VideoCore::RasterizerDownloadArea> flush_area{};
+    ForEachImageInRegion(cpu_addr, size, [&](ImageId, ImageBase& image) {
+        if (False(image.flags & ImageFlagBits::GpuModified)) {
+            return;
+        }
+        if (!flush_area.has_value()) {
+            flush_area.emplace();
+            flush_area->start_address = cpu_addr;
+            flush_area->end_address = cpu_addr + size;
+            flush_area->preemtive = true;
+        }
+        flush_area->start_address = std::min(flush_area->start_address, image.cpu_addr);
+        flush_area->end_address = std::max(flush_area->end_address, image.cpu_addr_end);
+        for (const auto view_id : image.image_view_ids) {
+            slot_image_views[view_id].flags |= ImageViewFlagBits::PreemtiveDownload;
+        }
+        flush_area->preemtive &= image.info.forced_flushed;  // set only if all were forced
+        image.info.forced_flushed = true;
+    });
+    return flush_area;
+}
+
+template <class P>
 void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
-    std::vector<ImageId> deleted_images;
+    boost::container::small_vector<ImageId, 16> deleted_images;
     ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
         Image& image = slot_images[id];
@@ -496,7 +593,7 @@ void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
 
 template <class P>
 void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) {
-    std::vector<ImageId> deleted_images;
+    boost::container::small_vector<ImageId, 16> deleted_images;
     ForEachImageInRegionGPU(as_id, gpu_addr, size,
                             [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
@@ -654,25 +751,41 @@ template <class P>
 void TextureCache<P>::CommitAsyncFlushes() {
     // This is intentionally passing the value by copy
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        const std::span<const ImageId> download_ids = uncommitted_downloads;
+        auto& download_ids = uncommitted_downloads;
         if (download_ids.empty()) {
             committed_downloads.emplace_back(std::move(uncommitted_downloads));
             uncommitted_downloads.clear();
-            async_buffers.emplace_back(std::optional<AsyncBuffer>{});
+            async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+            uncommitted_async_buffers.clear();
             return;
         }
         size_t total_size_bytes = 0;
-        for (const ImageId image_id : download_ids) {
-            total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+        size_t last_async_buffer_id = uncommitted_async_buffers.size();
+        bool any_none_dma = false;
+        for (PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                total_size_bytes +=
+                    Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64);
+                any_none_dma = true;
+                download_info.async_buffer_id = last_async_buffer_id;
+            }
         }
-        auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
-        for (const ImageId image_id : download_ids) {
-            Image& image = slot_images[image_id];
-            const auto copies = FullDownloadCopies(image.info);
-            image.DownloadMemory(download_map, copies);
-            download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+
+        if (any_none_dma) {
+            auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
+            for (const PendingDownload& download_info : download_ids) {
+                if (download_info.is_swizzle) {
+                    Image& image = slot_images[download_info.object_id];
+                    const auto copies = FullDownloadCopies(image.info);
+                    image.DownloadMemory(download_map, copies);
+                    download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+                }
+            }
+            uncommitted_async_buffers.emplace_back(download_map);
         }
-        async_buffers.emplace_back(download_map);
+
+        async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+        uncommitted_async_buffers.clear();
     }
     committed_downloads.emplace_back(std::move(uncommitted_downloads));
     uncommitted_downloads.clear();
@@ -684,39 +797,57 @@ void TextureCache<P>::PopAsyncFlushes() {
         return;
     }
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        const std::span<const ImageId> download_ids = committed_downloads.front();
+        const auto& download_ids = committed_downloads.front();
         if (download_ids.empty()) {
             committed_downloads.pop_front();
             async_buffers.pop_front();
             return;
         }
-        auto download_map = *async_buffers.front();
-        std::span<u8> download_span = download_map.mapped_span;
+        auto download_map = std::move(async_buffers.front());
         for (size_t i = download_ids.size(); i > 0; i--) {
-            const ImageBase& image = slot_images[download_ids[i - 1]];
-            const auto copies = FullDownloadCopies(image.info);
-            download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
-            std::span<u8> download_span_alt = download_span.subspan(download_map.offset);
-            SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt,
-                         swizzle_data_buffer);
+            auto& download_info = download_ids[i - 1];
+            auto& download_buffer = download_map[download_info.async_buffer_id];
+            if (download_info.is_swizzle) {
+                const ImageBase& image = slot_images[download_info.object_id];
+                const auto copies = FullDownloadCopies(image.info);
+                download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
+                std::span<u8> download_span =
+                    download_buffer.mapped_span.subspan(download_buffer.offset);
+                SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
+                             swizzle_data_buffer);
+            } else {
+                const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id];
+                std::span<u8> download_span =
+                    download_buffer.mapped_span.subspan(download_buffer.offset);
+                gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(),
+                                             buffer_info.size);
+                slot_buffer_downloads.erase(download_info.object_id);
+            }
+        }
+        for (auto& download_buffer : download_map) {
+            async_buffers_death_ring.emplace_back(download_buffer);
         }
-        runtime.FreeDeferredStagingBuffer(download_map);
         committed_downloads.pop_front();
         async_buffers.pop_front();
     } else {
-        const std::span<const ImageId> download_ids = committed_downloads.front();
+        const auto& download_ids = committed_downloads.front();
         if (download_ids.empty()) {
             committed_downloads.pop_front();
             return;
         }
         size_t total_size_bytes = 0;
-        for (const ImageId image_id : download_ids) {
-            total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+        for (const PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes;
+            }
         }
         auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
         const size_t original_offset = download_map.offset;
-        for (const ImageId image_id : download_ids) {
-            Image& image = slot_images[image_id];
+        for (const PendingDownload& download_info : download_ids) {
+            if (!download_info.is_swizzle) {
+                continue;
+            }
+            Image& image = slot_images[download_info.object_id];
             const auto copies = FullDownloadCopies(image.info);
             image.DownloadMemory(download_map, copies);
             download_map.offset += image.unswizzled_size_bytes;
@@ -725,8 +856,11 @@ void TextureCache<P>::PopAsyncFlushes() {
         runtime.Finish();
         download_map.offset = original_offset;
         std::span<u8> download_span = download_map.mapped_span;
-        for (const ImageId image_id : download_ids) {
-            const ImageBase& image = slot_images[image_id];
+        for (const PendingDownload& download_info : download_ids) {
+            if (!download_info.is_swizzle) {
+                continue;
+            }
+            const ImageBase& image = slot_images[download_info.object_id];
             const auto copies = FullDownloadCopies(image.info);
             SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
                          swizzle_data_buffer);
@@ -738,6 +872,26 @@ void TextureCache<P>::PopAsyncFlushes() {
 }
 
 template <class P>
+ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, bool is_upload) {
+    // Resolves a DMA image operand to a cached image. Yields NULL_IMAGE_ID when no image matches,
+    // when this is the first download seen for the image, or when the address has no base in it.
+    const ImageId found_id = FindDMAImage(ImageInfo(operand), operand.address);
+    if (!found_id) {
+        return NULL_IMAGE_ID;
+    }
+    auto& image = slot_images[found_id];
+    const bool first_download = !is_upload && !image.info.dma_downloaded;
+    if (first_download) {
+        // Force a full sync, and remember it so the next download can use the image.
+        image.info.dma_downloaded = true;
+        return NULL_IMAGE_ID;
+    }
+    // The operand address must resolve to a subresource base of the image
+    const bool has_base = image.TryFindBase(operand.address).has_value();
+    return has_base ? found_id : NULL_IMAGE_ID;
+}
+
+template <class P>
 bool TextureCache<P>::IsRescaling() const noexcept {
     return is_rescaling;
 }
@@ -765,6 +919,76 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 }
 
 template <class P>
+std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy(
+    const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+    const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) {
+    // Describes a DMA transfer between a pitched buffer and a single layer of a cached image.
+    // Returns that image with the copy region; image_id indexes slot_images without validation.
+    const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image);
+    Image& image = slot_images[image_id];
+    const u32 block_bytes = VideoCore::Surface::BytesPerBlock(image.info.format);
+    // Rescales a horizontal quantity by operand bytes per pixel over format bytes per block
+    const auto rescale = [dma_bpp = image_operand.bytes_per_pixel, block_bytes](u32 value) {
+        return (dma_bpp * value) / block_bytes;
+    };
+    const u32 rows = copy_info.length_y;
+
+    const BufferImageCopy copy{
+        .buffer_offset = 0,
+        .buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height),
+        .buffer_row_length = rescale(buffer_operand.pitch),
+        .buffer_image_height = buffer_operand.height,
+        .image_subresource =
+            {
+                .base_level = static_cast<s32>(level),
+                .base_layer = static_cast<s32>(base),
+                .num_layers = 1,
+            },
+        .image_offset =
+            {
+                // Only x is rescaled; the row origin is passed through unchanged
+                .x = static_cast<s32>(rescale(image_operand.params.origin.x.Value())),
+                .y = static_cast<s32>(image_operand.params.origin.y.Value()),
+                .z = 0,
+            },
+        .image_extent =
+            {
+                .width = rescale(copy_info.length_x),
+                .height = rows,
+                .depth = 1,
+            },
+    };
+    return {&image, copy};
+}
+
+/// Downloads `copies` of `image` into `buffer` at `buffer_offset`.
+/// With asynchronous downloads the data is additionally written to a staging buffer and queued
+/// as a pending download of `size` bytes for GPU address `address`; otherwise `address` and
+/// `size` are unused.
+template <class P>
+void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* image,
+                                              typename TextureCache<P>::BufferType buffer,
+                                              size_t buffer_offset,
+                                              std::span<const VideoCommon::BufferImageCopy> copies,
+                                              GPUVAddr address, size_t size) {
+    if constexpr (!IMPLEMENTS_ASYNC_DOWNLOADS) {
+        // Synchronous runtimes only copy into the caller's buffer
+        image->DownloadMemory(buffer, buffer_offset, copies);
+    } else {
+        // Record the pending download before the staging buffer that will hold its data exists
+        const auto slot = slot_buffer_downloads.insert(BufferDownload{address, size});
+        // The index is taken before the staging buffer is appended, so it refers to that buffer
+        const auto staging_index = uncommitted_async_buffers.size();
+        uncommitted_downloads.emplace_back(PendingDownload{false, staging_index, slot});
+        auto staging = runtime.DownloadStagingBuffer(size, true);
+        uncommitted_async_buffers.emplace_back(staging);
+        std::array targets{buffer, staging.buffer};
+        std::array<u64, 2> target_offsets{buffer_offset, staging.offset};
+        image->DownloadMemory(targets, target_offsets, copies);
+    }
+}
+
+template <class P>
 void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
     if (False(image.flags & ImageFlagBits::CpuModified)) {
         // Only upload modified images
@@ -773,10 +997,14 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
     image.flags &= ~ImageFlagBits::CpuModified;
     TrackImage(image, image_id);
 
-    if (image.info.num_samples > 1) {
+    if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
         return;
     }
+    if (True(image.flags & ImageFlagBits::AsynchronousDecode)) {
+        QueueAsyncDecode(image, image_id);
+        return;
+    }
     auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
     UploadImageContents(image, staging);
     runtime.InsertUploadMemoryBarrier();
@@ -873,7 +1101,7 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
     const bool native_bgr = runtime.HasNativeBgr();
     const bool flexible_formats = True(options & RelaxedOptions::Format);
     ImageId image_id{};
-    boost::container::small_vector<ImageId, 1> image_ids;
+    boost::container::small_vector<ImageId, 8> image_ids;
     const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
         if (True(existing_image.flags & ImageFlagBits::Remapped)) {
             return false;
@@ -990,6 +1218,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
 }
 
 template <class P>
+void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
+    UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted));
+    LOG_INFO(HW_GPU, "Queuing async texture decode");
+    // Flag the image as decoding and register a context that is kept in async_decodes
+    image.flags |= ImageFlagBits::IsDecoding;
+    auto decode = std::make_unique<AsyncDecodeContext>();
+    auto* decode_ptr = decode.get();
+    decode->image_id = image_id;
+    async_decodes.push_back(std::move(decode));
+    // Guest memory is read and unswizzled here, before any work is queued
+    Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes);
+    const size_t guest_size_bytes = image.guest_size_bytes;
+    swizzle_data_buffer.resize_destructive(guest_size_bytes);
+    gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+    auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer,
+                                 local_unswizzle_data_buffer);
+    const size_t out_size = MapSizeBytes(image);
+    // Only the conversion is deferred; the lambda owns its input and a copy of the image info
+    auto func = [out_size, copies, info = image.info,
+                 input = std::move(local_unswizzle_data_buffer),
+                 async_decode = decode_ptr]() mutable {
+        async_decode->decoded_data.resize_destructive(out_size);
+        std::span copies_span{copies.data(), copies.size()};
+        ConvertImage(input, info, async_decode->decoded_data, copies_span);
+        // Publish the result; TickAsyncDecode reads these fields under the same mutex
+        // TODO: Do we need this lock?
+        std::unique_lock lock{async_decode->mutex};
+        async_decode->copies = std::move(copies);
+        async_decode->complete = true;
+    };
+    texture_decode_worker.QueueWork(std::move(func));
+}
+
+template <class P>
+void TextureCache<P>::TickAsyncDecode() {  // Uploads finished decodes and drops their contexts
+    bool has_uploads{};
+    for (auto it = async_decodes.begin(); it != async_decodes.end();) {
+        auto* async_decode = it->get();
+        std::unique_lock lock{async_decode->mutex};
+        if (!async_decode->complete) {
+            ++it;  // still decoding, poll again on a later tick
+            continue;
+        }
+        Image& image = slot_images[async_decode->image_id];
+        auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
+        std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(),
+                    async_decode->decoded_data.size());
+        image.UploadMemory(staging, async_decode->copies);
+        image.flags &= ~ImageFlagBits::IsDecoding;
+        has_uploads = true;
+        lock.unlock();  // erase() frees the context that owns the mutex, so release it first
+        it = async_decodes.erase(it);
+    }
+    if (has_uploads) {
+        runtime.InsertUploadMemoryBarrier();
+    }
+}
+
+template <class P>
 bool TextureCache<P>::ScaleUp(Image& image) {
     const bool has_copy = image.HasScaled();
     const bool rescaled = image.ScaleUp();
@@ -1044,17 +1331,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
     const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
     const bool broken_views = runtime.HasBrokenTextureViewFormats();
     const bool native_bgr = runtime.HasNativeBgr();
-    std::vector<ImageId> overlap_ids;
-    std::unordered_set<ImageId> overlaps_found;
-    std::vector<ImageId> left_aliased_ids;
-    std::vector<ImageId> right_aliased_ids;
-    std::unordered_set<ImageId> ignore_textures;
-    std::vector<ImageId> bad_overlap_ids;
-    std::vector<ImageId> all_siblings;
+    join_overlap_ids.clear();
+    join_overlaps_found.clear();
+    join_left_aliased_ids.clear();
+    join_right_aliased_ids.clear();
+    join_ignore_textures.clear();
+    join_bad_overlap_ids.clear();
+    join_copies_to_do.clear();
+    join_alias_indices.clear();
     const bool this_is_linear = info.type == ImageType::Linear;
     const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) {
         if (True(overlap.flags & ImageFlagBits::Remapped)) {
-            ignore_textures.insert(overlap_id);
+            join_ignore_textures.insert(overlap_id);
             return;
         }
         const bool overlap_is_linear = overlap.info.type == ImageType::Linear;
@@ -1064,11 +1352,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
         if (this_is_linear && overlap_is_linear) {
             if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) {
                 // Alias linear images with the same pitch
-                left_aliased_ids.push_back(overlap_id);
+                join_left_aliased_ids.push_back(overlap_id);
             }
             return;
         }
-        overlaps_found.insert(overlap_id);
+        join_overlaps_found.insert(overlap_id);
         static constexpr bool strict_size = true;
         const std::optional<OverlapResult> solution = ResolveOverlap(
             new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views, native_bgr);
@@ -1076,34 +1364,33 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
             gpu_addr = solution->gpu_addr;
             cpu_addr = solution->cpu_addr;
             new_info.resources = solution->resources;
-            overlap_ids.push_back(overlap_id);
-            all_siblings.push_back(overlap_id);
+            join_overlap_ids.push_back(overlap_id);
+            join_copies_to_do.emplace_back(JoinCopy{false, overlap_id});
             return;
         }
         static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format;
         const ImageBase new_image_base(new_info, gpu_addr, cpu_addr);
         if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) {
-            left_aliased_ids.push_back(overlap_id);
+            join_left_aliased_ids.push_back(overlap_id);
             overlap.flags |= ImageFlagBits::Alias;
-            all_siblings.push_back(overlap_id);
+            join_copies_to_do.emplace_back(JoinCopy{true, overlap_id});
         } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options,
                                  broken_views, native_bgr)) {
-            right_aliased_ids.push_back(overlap_id);
+            join_right_aliased_ids.push_back(overlap_id);
             overlap.flags |= ImageFlagBits::Alias;
-            all_siblings.push_back(overlap_id);
+            join_copies_to_do.emplace_back(JoinCopy{true, overlap_id});
         } else {
-            bad_overlap_ids.push_back(overlap_id);
-            overlap.flags |= ImageFlagBits::BadOverlap;
+            join_bad_overlap_ids.push_back(overlap_id);
         }
     };
     ForEachImageInRegion(cpu_addr, size_bytes, region_check);
     const auto region_check_gpu = [&](ImageId overlap_id, ImageBase& overlap) {
-        if (!overlaps_found.contains(overlap_id)) {
+        if (!join_overlaps_found.contains(overlap_id)) {
             if (True(overlap.flags & ImageFlagBits::Remapped)) {
-                ignore_textures.insert(overlap_id);
+                join_ignore_textures.insert(overlap_id);
             }
             if (overlap.gpu_addr == gpu_addr && overlap.guest_size_bytes == size_bytes) {
-                ignore_textures.insert(overlap_id);
+                join_ignore_textures.insert(overlap_id);
             }
         }
     };
@@ -1111,11 +1398,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
 
     bool can_rescale = info.rescaleable;
     bool any_rescaled = false;
-    for (const ImageId sibling_id : all_siblings) {
+    for (const auto& copy : join_copies_to_do) {
         if (!can_rescale) {
             break;
         }
-        Image& sibling = slot_images[sibling_id];
+        Image& sibling = slot_images[copy.id];
         can_rescale &= ImageCanRescale(sibling);
         any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled);
     }
@@ -1123,13 +1410,13 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
     can_rescale &= any_rescaled;
 
     if (can_rescale) {
-        for (const ImageId sibling_id : all_siblings) {
-            Image& sibling = slot_images[sibling_id];
+        for (const auto& copy : join_copies_to_do) {
+            Image& sibling = slot_images[copy.id];
             ScaleUp(sibling);
         }
     } else {
-        for (const ImageId sibling_id : all_siblings) {
-            Image& sibling = slot_images[sibling_id];
+        for (const auto& copy : join_copies_to_do) {
+            Image& sibling = slot_images[copy.id];
             ScaleDown(sibling);
         }
     }
@@ -1137,11 +1424,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
     const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
     Image& new_image = slot_images[new_image_id];
 
-    if (!gpu_memory->IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) {
+    if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes)) {
         new_image.flags |= ImageFlagBits::Sparse;
     }
 
-    for (const ImageId overlap_id : ignore_textures) {
+    for (const ImageId overlap_id : join_ignore_textures) {
         Image& overlap = slot_images[overlap_id];
         if (True(overlap.flags & ImageFlagBits::GpuModified)) {
             UNIMPLEMENTED();
@@ -1162,44 +1449,81 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
         ScaleDown(new_image);
     }
 
-    for (const ImageId overlap_id : overlap_ids) {
-        Image& overlap = slot_images[overlap_id];
-        if (True(overlap.flags & ImageFlagBits::GpuModified)) {
-            new_image.flags |= ImageFlagBits::GpuModified;
-        }
-        if (overlap.info.num_samples != new_image.info.num_samples) {
-            LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented");
-        } else {
-            const auto& resolution = Settings::values.resolution_info;
-            const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
-            const u32 up_scale = can_rescale ? resolution.up_scale : 1;
-            const u32 down_shift = can_rescale ? resolution.down_shift : 0;
-            auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
-            runtime.CopyImage(new_image, overlap, std::move(copies));
-        }
-        if (True(overlap.flags & ImageFlagBits::Tracked)) {
-            UntrackImage(overlap, overlap_id);
-        }
-        UnregisterImage(overlap_id);
-        DeleteImage(overlap_id);
-    }
+    std::ranges::sort(join_copies_to_do, [this](const JoinCopy& lhs, const JoinCopy& rhs) {
+        const ImageBase& lhs_image = slot_images[lhs.id];
+        const ImageBase& rhs_image = slot_images[rhs.id];
+        return lhs_image.modification_tick < rhs_image.modification_tick;
+    });
+
     ImageBase& new_image_base = new_image;
-    for (const ImageId aliased_id : right_aliased_ids) {
+    for (const ImageId aliased_id : join_right_aliased_ids) {
         ImageBase& aliased = slot_images[aliased_id];
-        AddImageAlias(new_image_base, aliased, new_image_id, aliased_id);
+        size_t alias_index = new_image_base.aliased_images.size();
+        if (!AddImageAlias(new_image_base, aliased, new_image_id, aliased_id)) {
+            continue;
+        }
+        join_alias_indices.emplace(aliased_id, alias_index);
         new_image.flags |= ImageFlagBits::Alias;
     }
-    for (const ImageId aliased_id : left_aliased_ids) {
+    for (const ImageId aliased_id : join_left_aliased_ids) {
         ImageBase& aliased = slot_images[aliased_id];
-        AddImageAlias(aliased, new_image_base, aliased_id, new_image_id);
+        size_t alias_index = new_image_base.aliased_images.size();
+        if (!AddImageAlias(aliased, new_image_base, aliased_id, new_image_id)) {
+            continue;
+        }
+        join_alias_indices.emplace(aliased_id, alias_index);
         new_image.flags |= ImageFlagBits::Alias;
     }
-    for (const ImageId aliased_id : bad_overlap_ids) {
+    for (const ImageId aliased_id : join_bad_overlap_ids) {
         ImageBase& aliased = slot_images[aliased_id];
         aliased.overlapping_images.push_back(new_image_id);
         new_image.overlapping_images.push_back(aliased_id);
-        new_image.flags |= ImageFlagBits::BadOverlap;
+        if (aliased.info.resources.levels == 1 && aliased.info.block.depth == 0 &&
+            aliased.overlapping_images.size() > 1) {
+            aliased.flags |= ImageFlagBits::BadOverlap;
+        }
+        if (new_image.info.resources.levels == 1 && new_image.info.block.depth == 0 &&
+            new_image.overlapping_images.size() > 1) {
+            new_image.flags |= ImageFlagBits::BadOverlap;
+        }
+    }
+
+    for (const auto& copy_object : join_copies_to_do) {
+        Image& overlap = slot_images[copy_object.id];
+        if (copy_object.is_alias) {
+            if (!overlap.IsSafeDownload()) {
+                continue;
+            }
+            const auto alias_pointer = join_alias_indices.find(copy_object.id);
+            if (alias_pointer == join_alias_indices.end()) {
+                continue;
+            }
+            const AliasedImage& aliased = new_image.aliased_images[alias_pointer->second];
+            CopyImage(new_image_id, aliased.id, aliased.copies);
+            new_image.modification_tick = overlap.modification_tick;
+            continue;
+        }
+        if (True(overlap.flags & ImageFlagBits::GpuModified)) {
+            new_image.flags |= ImageFlagBits::GpuModified;
+            const auto& resolution = Settings::values.resolution_info;
+            const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
+            const u32 up_scale = can_rescale ? resolution.up_scale : 1;
+            const u32 down_shift = can_rescale ? resolution.down_shift : 0;
+            auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
+            if (overlap.info.num_samples != new_image.info.num_samples) {
+                runtime.CopyImageMSAA(new_image, overlap, std::move(copies));
+            } else {
+                runtime.CopyImage(new_image, overlap, std::move(copies));
+            }
+            new_image.modification_tick = overlap.modification_tick;
+        }
+        if (True(overlap.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(overlap, copy_object.id);
+        }
+        UnregisterImage(copy_object.id);
+        DeleteImage(copy_object.id);
     }
+
     RegisterImage(new_image_id);
     return new_image_id;
 }
@@ -1289,6 +1613,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
 }
 
 template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+    std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {
+        cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+        if (!cpu_addr) {
+            return ImageId{};
+        }
+    }
+    ImageId image_id{};
+    boost::container::small_vector<ImageId, 8> image_ids;
+    const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+        if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+            return false;
+        }
+        if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+            [[unlikely]] {
+            const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+            const ImageInfo& existing = existing_image.info;
+            if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+                existing.pitch == info.pitch &&
+                IsPitchLinearSameSize(existing, info, strict_size) &&
+                IsViewCompatible(existing.format, info.format, false, true)) {
+                image_id = existing_image_id;
+                image_ids.push_back(existing_image_id);
+                return true;
+            }
+        } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+            image_id = existing_image_id;
+            image_ids.push_back(existing_image_id);
+            return true;
+        }
+        return false;
+    };
+    ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+    if (image_ids.size() <= 1) [[likely]] {
+        return image_id;
+    }
+    auto image_ids_compare = [this](ImageId a, ImageId b) {
+        auto& image_a = slot_images[a];
+        auto& image_b = slot_images[b];
+        return image_a.modification_tick < image_b.modification_tick;
+    };
+    return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+template <class P>
+std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+                                                     bool mark_as_modified) {
+    // Prepares dst_id and returns the (level, layer) that base_addr resolves to. .value()
+    // mirrors JoinImages: an unresolved base_addr throws rather than reading an empty optional.
+    const SubresourceBase base = slot_images[dst_id].TryFindBase(base_addr).value();
+    PrepareImage(dst_id, mark_as_modified, false);
+    lru_cache.Touch(slot_images[dst_id].lru_index, frame_tick); // slot re-read after prepare
+    return std::make_pair(base.level, base.layer);
+}
+
+template <class P>
 SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
     if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
         return NULL_SAMPLER_ID;
@@ -1301,7 +1682,7 @@ SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
+ImageViewId TextureCache<P>::FindColorBuffer(size_t index) {
     const auto& regs = maxwell3d->regs;
     if (index >= regs.rt_control.count) {
         return ImageViewId{};
@@ -1314,12 +1695,12 @@ ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
     if (rt.format == Tegra::RenderTargetFormat::NONE) {
         return ImageViewId{};
     }
-    const ImageInfo info(regs, index);
-    return FindRenderTargetView(info, gpu_addr, is_clear);
+    const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode);
+    return FindRenderTargetView(info, gpu_addr);
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
+ImageViewId TextureCache<P>::FindDepthBuffer() {
     const auto& regs = maxwell3d->regs;
     if (!regs.zeta_enable) {
         return ImageViewId{};
@@ -1328,19 +1709,17 @@ ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
     if (gpu_addr == 0) {
         return ImageViewId{};
     }
-    const ImageInfo info(regs);
-    return FindRenderTargetView(info, gpu_addr, is_clear);
+    const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode);
+    return FindRenderTargetView(info, gpu_addr);
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
-                                                  bool is_clear) {
-    const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{};
+ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) {
     ImageId image_id{};
     bool delete_state = has_deleted_images;
     do {
         has_deleted_images = false;
-        image_id = FindOrInsertImage(info, gpu_addr, options);
+        image_id = FindOrInsertImage(info, gpu_addr);
         delete_state |= has_deleted_images;
     } while (has_deleted_images);
     has_deleted_images = delete_state;
@@ -1427,37 +1806,38 @@ void TextureCache<P>::ForEachImageInRegionGPU(size_t as_id, GPUVAddr gpu_addr, s
         return;
     }
     auto& gpu_page_table = gpu_page_table_storage[*storage_id];
-    ForEachGPUPage(gpu_addr, size, [this, gpu_page_table, &images, gpu_addr, size, func](u64 page) {
-        const auto it = gpu_page_table.find(page);
-        if (it == gpu_page_table.end()) {
-            if constexpr (BOOL_BREAK) {
-                return false;
-            } else {
-                return;
-            }
-        }
-        for (const ImageId image_id : it->second) {
-            Image& image = slot_images[image_id];
-            if (True(image.flags & ImageFlagBits::Picked)) {
-                continue;
-            }
-            if (!image.OverlapsGPU(gpu_addr, size)) {
-                continue;
-            }
-            image.flags |= ImageFlagBits::Picked;
-            images.push_back(image_id);
-            if constexpr (BOOL_BREAK) {
-                if (func(image_id, image)) {
-                    return true;
-                }
-            } else {
-                func(image_id, image);
-            }
-        }
-        if constexpr (BOOL_BREAK) {
-            return false;
-        }
-    });
+    ForEachGPUPage(gpu_addr, size,
+                   [this, &gpu_page_table, &images, gpu_addr, size, func](u64 page) {
+                       const auto it = gpu_page_table.find(page);
+                       if (it == gpu_page_table.end()) {
+                           if constexpr (BOOL_BREAK) {
+                               return false;
+                           } else {
+                               return;
+                           }
+                       }
+                       for (const ImageId image_id : it->second) {
+                           Image& image = slot_images[image_id];
+                           if (True(image.flags & ImageFlagBits::Picked)) {
+                               continue;
+                           }
+                           if (!image.OverlapsGPU(gpu_addr, size)) {
+                               continue;
+                           }
+                           image.flags |= ImageFlagBits::Picked;
+                           images.push_back(image_id);
+                           if constexpr (BOOL_BREAK) {
+                               if (func(image_id, image)) {
+                                   return true;
+                               }
+                           } else {
+                               func(image_id, image);
+                           }
+                       }
+                       if constexpr (BOOL_BREAK) {
+                           return false;
+                       }
+                   });
     for (const ImageId image_id : images) {
         slot_images[image_id].flags &= ~ImageFlagBits::Picked;
     }
@@ -1549,10 +1929,6 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
         tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
     }
     total_used_memory += Common::AlignUp(tentative_size, 1024);
-    if (total_used_memory > critical_memory && critical_gc < GC_EMERGENCY_COUNTS) {
-        RunGarbageCollector();
-        critical_gc++;
-    }
     image.lru_index = lru_cache.Insert(image_id, frame_tick);
 
     ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, [this, image_id](u64 page) {
@@ -1566,7 +1942,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
         image.map_view_id = map_id;
         return;
     }
-    std::vector<ImageViewId> sparse_maps{};
+    boost::container::small_vector<ImageViewId, 16> sparse_maps;
     ForEachSparseSegment(
         image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) {
             auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id);
@@ -1841,7 +2217,7 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept {
 
 template <class P>
 void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
-    boost::container::small_vector<const AliasedImage*, 1> aliased_images;
+    boost::container::small_vector<const AliasedImage*, 8> aliased_images;
     Image& image = slot_images[image_id];
     bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled);
     bool any_modified = True(image.flags & ImageFlagBits::GpuModified);
@@ -2019,7 +2395,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id)
     if (new_id) {
         const ImageViewBase& old_view = slot_image_views[new_id];
         if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) {
-            uncommitted_downloads.push_back(old_view.image_id);
+            const PendingDownload new_download{true, 0, old_view.image_id};
+            uncommitted_downloads.emplace_back(new_download);
         }
     }
     *old_id = new_id;