1 files changed, 139 insertions, 0 deletions
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 3e2cbb0b0..335338434 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -85,6 +85,11 @@ void TextureCache<P>::RunGarbageCollector() {
         }
         --num_iterations;
         auto& image = slot_images[image_id];
+        if (True(image.flags & ImageFlagBits::IsDecoding)) {
+            // This image is still being decoded, deleting it will invalidate the slot
+            // used by the async decoder thread.
+            return false;
+        }
         const bool must_download =
             image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
         if (!high_priority_mode &&
@@ -133,6 +138,8 @@ void TextureCache<P>::TickFrame() {
     sentenced_images.Tick();
     sentenced_framebuffers.Tick();
     sentenced_image_view.Tick();
+    TickAsyncDecode();
+
     runtime.TickFrame();
     critical_gc = 0;
     ++frame_tick;
@@ -777,6 +784,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
         return;
     }
+    if (True(image.flags & ImageFlagBits::AsynchronousDecode)) {
+        QueueAsyncDecode(image, image_id);
+        return;
+    }
     auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
     UploadImageContents(image, staging);
     runtime.InsertUploadMemoryBarrier();
@@ -990,6 +1001,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
 }
 
 template <class P>
+void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
+    UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted));
+    LOG_INFO(HW_GPU, "Queuing async texture decode");
+
+    image.flags |= ImageFlagBits::IsDecoding;
+    auto decode = std::make_unique<AsyncDecodeContext>();
+    auto* decode_ptr = decode.get();
+    decode->image_id = image_id;
+    async_decodes.push_back(std::move(decode));
+
+    Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes);
+    const size_t guest_size_bytes = image.guest_size_bytes;
+    swizzle_data_buffer.resize_destructive(guest_size_bytes);
+    gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+    auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer,
+                                 local_unswizzle_data_buffer);
+    const size_t out_size = MapSizeBytes(image);
+
+    auto func = [out_size, copies, info = image.info,
+                 input = std::move(local_unswizzle_data_buffer),
+                 async_decode = decode_ptr]() mutable {
+        async_decode->decoded_data.resize_destructive(out_size);
+        std::span copies_span{copies.data(), copies.size()};
+        ConvertImage(input, info, async_decode->decoded_data, copies_span);
+
+        // TODO: Do we need this lock?
+        std::unique_lock lock{async_decode->mutex};
+        async_decode->copies = std::move(copies);
+        async_decode->complete = true;
+    };
+    texture_decode_worker.QueueWork(std::move(func));
+}
+
+// Uploads the output of every completed async decode to its image and drops the context.
+template <class P>
+void TextureCache<P>::TickAsyncDecode() {
+    bool has_uploads{};
+    for (auto it = async_decodes.begin(); it != async_decodes.end();) {
+        auto& ctx = **it;
+        // Lock only for the completion check: erase() below destroys the context and its
+        // mutex, so the lock must already be released by then.
+        if (std::unique_lock lock{ctx.mutex}; !ctx.complete) {
+            ++it;
+            continue;
+        }
+        Image& image = slot_images[ctx.image_id];
+        auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
+        std::memcpy(staging.mapped_span.data(), ctx.decoded_data.data(), ctx.decoded_data.size());
+        image.UploadMemory(staging, ctx.copies);
+        image.flags &= ~ImageFlagBits::IsDecoding;
+        has_uploads = true;
+        it = async_decodes.erase(it);
+    }
+    if (has_uploads) {
+        runtime.InsertUploadMemoryBarrier();
+    }
+}
+
+template <class P>
 bool TextureCache<P>::ScaleUp(Image& image) {
     const bool has_copy = image.HasScaled();
     const bool rescaled = image.ScaleUp();
@@ -1289,6 +1359,75 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
 }
 
 template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+    std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {
+        cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+        if (!cpu_addr) {
+            return ImageId{};
+        }
+    }
+    ImageId image_id{};
+    boost::container::small_vector<ImageId, 1> image_ids;
+    const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+        if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+            return false;
+        }
+        if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+            [[unlikely]] {
+            const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+            const ImageInfo& existing = existing_image.info;
+            if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+                existing.pitch == info.pitch &&
+                IsPitchLinearSameSize(existing, info, strict_size) &&
+                IsViewCompatible(existing.format, info.format, false, true)) {
+                image_id = existing_image_id;
+                image_ids.push_back(existing_image_id);
+                return true;
+            }
+        } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+            image_id = existing_image_id;
+            image_ids.push_back(existing_image_id);
+            return true;
+        }
+        return false;
+    };
+    ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+    if (image_ids.size() <= 1) [[likely]] {
+        return image_id;
+    }
+    auto image_ids_compare = [this](ImageId a, ImageId b) {
+        auto& image_a = slot_images[a];
+        auto& image_b = slot_images[b];
+        return image_a.modification_tick < image_b.modification_tick;
+    };
+    return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+// Resolves a DMA operand to its cached image and the (level, layer) TryFindBase reports.
+// Returns nullopt if no image matches, no base is found, or the image is not GpuModified.
+template <class P>
+std::optional<std::pair<typename TextureCache<P>::Image*, std::pair<u32, u32>>>
+TextureCache<P>::ObtainImage(const Tegra::DMA::ImageOperand& operand, bool mark_as_modified) {
+    ImageInfo operand_info(operand);
+    ImageId found_id = FindDMAImage(operand_info, operand.address);
+    if (!found_id) {
+        return std::nullopt;
+    }
+    Image& candidate = slot_images[found_id];
+    auto base = candidate.TryFindBase(operand.address);
+    // An image that is still in sync with guest memory is not worth the work below.
+    if (!base || False(candidate.flags & ImageFlagBits::GpuModified)) {
+        return std::nullopt;
+    }
+    PrepareImage(found_id, mark_as_modified, false);
+    // Index the slot again after PrepareImage instead of reusing `candidate`.
+    auto& prepared = slot_images[found_id];
+    lru_cache.Touch(prepared.lru_index, frame_tick);
+    return std::make_pair(&prepared, std::make_pair(base->level, base->layer));
+}
+
+template <class P>
 SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
     if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
         return NULL_SAMPLER_ID;