diff options
Diffstat (limited to '')
113 files changed, 5328 insertions, 1461 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 7f79111e0..cf9266d54 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -95,6 +95,12 @@ add_library(video_core STATIC memory_manager.h precompiled_headers.h pte_kind.h + query_cache/bank_base.h + query_cache/query_base.h + query_cache/query_cache_base.h + query_cache/query_cache.h + query_cache/query_stream.h + query_cache/types.h query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h @@ -275,6 +281,8 @@ add_library(video_core STATIC vulkan_common/nsight_aftermath_tracker.cpp vulkan_common/nsight_aftermath_tracker.h vulkan_common/vma.cpp + vulkan_common/vma.h + vulkan_common/vulkan.h ) create_target_directory_groups(video_core) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index f0f450edb..9e90c587c 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -272,13 +272,19 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad if (!cpu_addr) { return {&slot_buffers[NULL_BUFFER_ID], 0}; } - const BufferId buffer_id = FindBuffer(*cpu_addr, size); + return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op); +} + +template <class P> +std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer( + VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) { + const BufferId buffer_id = FindBuffer(cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; // synchronize op switch (sync_info) { case ObtainBufferSynchronize::FullSynchronize: - SynchronizeBuffer(buffer, *cpu_addr, size); + SynchronizeBuffer(buffer, cpu_addr, size); break; default: break; @@ -286,18 +292,21 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad switch (post_op) { case ObtainBufferOperation::MarkAsWritten: - MarkWrittenBuffer(buffer_id, *cpu_addr, size); + 
MarkWrittenBuffer(buffer_id, cpu_addr, size); break; case ObtainBufferOperation::DiscardWrite: { - IntervalType interval{*cpu_addr, size}; + VAddr cpu_addr_start = Common::AlignDown(cpu_addr, 64); + VAddr cpu_addr_end = Common::AlignUp(cpu_addr + size, 64); + IntervalType interval{cpu_addr_start, cpu_addr_end}; ClearDownload(interval); + common_ranges.subtract(interval); break; } default: break; } - return {&buffer, buffer.Offset(*cpu_addr)}; + return {&buffer, buffer.Offset(cpu_addr)}; } template <class P> @@ -1159,6 +1168,11 @@ void BufferCache<P>::UpdateDrawIndirect() { .size = static_cast<u32>(size), .buffer_id = FindBuffer(*cpu_addr, static_cast<u32>(size)), }; + VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64); + VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64); + IntervalType interval{cpu_addr_start, cpu_addr_end}; + ClearDownload(interval); + common_ranges.subtract(interval); }; if (current_draw_indirect->include_count) { update(current_draw_indirect->count_start_address, sizeof(u32), diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 0b7135d49..c4f6e8d12 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -295,6 +295,10 @@ public: [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op); + + [[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); void FlushCachedWrites(); /// Return true when there are uncommitted buffers to be downloaded @@ -335,6 +339,14 @@ public: [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); + template <typename Func> + void BufferOperations(Func&& func) { + do { + channel_state->has_deleted_buffers = false; + func(); + } while (channel_state->has_deleted_buffers); + } + std::recursive_mutex 
mutex; Runtime& runtime; diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index 46bc9e322..5574e1fba 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -51,7 +51,7 @@ public: virtual void CreateChannel(Tegra::Control::ChannelState& channel); /// Bind a channel for execution. - void BindToChannel(s32 id); + virtual void BindToChannel(s32 id); /// Erase channel's state. void EraseChannel(s32 id); diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 9f1b340a9..58ce0d8c2 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -14,6 +14,7 @@ namespace Tegra { constexpr u32 MacroRegistersStart = 0xE00; +constexpr u32 ComputeInline = 0x6D; DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_, Control::ChannelState& channel_state_) @@ -83,12 +84,35 @@ bool DmaPusher::Step() { dma_state.dma_get, command_list_header.size * sizeof(u32)); } } - Core::Memory::GpuGuestMemory<Tegra::CommandHeader, - Core::Memory::GuestMemoryFlags::UnsafeRead> - headers(memory_manager, dma_state.dma_get, command_list_header.size, &command_headers); - ProcessCommands(headers); + const auto safe_process = [&] { + Core::Memory::GpuGuestMemory<Tegra::CommandHeader, + Core::Memory::GuestMemoryFlags::SafeRead> + headers(memory_manager, dma_state.dma_get, command_list_header.size, + &command_headers); + ProcessCommands(headers); + }; + const auto unsafe_process = [&] { + Core::Memory::GpuGuestMemory<Tegra::CommandHeader, + Core::Memory::GuestMemoryFlags::UnsafeRead> + headers(memory_manager, dma_state.dma_get, command_list_header.size, + &command_headers); + ProcessCommands(headers); + }; + if (Settings::IsGPULevelHigh()) { + if (dma_state.method >= MacroRegistersStart) { + unsafe_process(); + return true; + } + if (subchannel_type[dma_state.subchannel] == Engines::EngineTypes::KeplerCompute && + 
dma_state.method == ComputeInline) { + unsafe_process(); + return true; + } + safe_process(); + return true; + } + unsafe_process(); } - return true; } diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 8a2784cdc..e46a8fa5c 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -130,8 +130,10 @@ public: void DispatchCalls(); - void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) { + void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id, + Engines::EngineTypes engine_type) { subchannels[subchannel_id] = engine; + subchannel_type[subchannel_id] = engine_type; } void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); @@ -159,7 +161,7 @@ private: u32 method_count; ///< Current method count u32 length_pending; ///< Large NI command length pending GPUVAddr dma_get; ///< Currently read segment - u64 dma_word_offset; ///< Current word ofset from address + u64 dma_word_offset; ///< Current word offset from address bool non_incrementing; ///< Current command's NI flag bool is_last_call; }; @@ -170,6 +172,7 @@ private: const bool ib_enable{true}; ///< IB mode enabled std::array<Engines::EngineInterface*, max_subchannels> subchannels{}; + std::array<Engines::EngineTypes, max_subchannels> subchannel_type; GPU& gpu; Core::System& system; diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 7c22c49f1..18d959143 100644 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h @@ -46,6 +46,7 @@ public: }; struct IndirectParams { + bool is_byte_count; bool is_indexed; bool include_count; GPUVAddr count_start_address; diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h index 392322358..54631ee6c 100644 --- a/src/video_core/engines/engine_interface.h +++ b/src/video_core/engines/engine_interface.h @@ -11,6 +11,14 @@ namespace Tegra::Engines { +enum class EngineTypes : u32 
{ + KeplerCompute, + Maxwell3D, + Fermi2D, + MaxwellDMA, + KeplerMemory, +}; + class EngineInterface { public: virtual ~EngineInterface() = default; diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h index 7242d2529..21bf8aeb4 100644 --- a/src/video_core/engines/engine_upload.h +++ b/src/video_core/engines/engine_upload.h @@ -69,6 +69,14 @@ public: /// Binds a rasterizer to this engine. void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); + GPUVAddr ExecTargetAddress() const { + return regs.dest.Address(); + } + + u32 GetUploadSize() const { + return copy_size; + } + private: void ProcessData(std::span<const u8> read_buffer); diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index a38d9528a..cd61ab222 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -43,16 +43,33 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal switch (method) { case KEPLER_COMPUTE_REG_INDEX(exec_upload): { + UploadInfo info{.upload_address = upload_address, + .exec_address = upload_state.ExecTargetAddress(), + .copy_size = upload_state.GetUploadSize()}; + uploads.push_back(info); upload_state.ProcessExec(regs.exec_upload.linear != 0); break; } case KEPLER_COMPUTE_REG_INDEX(data_upload): { + upload_address = current_dma_segment; upload_state.ProcessData(method_argument, is_last_call); break; } - case KEPLER_COMPUTE_REG_INDEX(launch): + case KEPLER_COMPUTE_REG_INDEX(launch): { + const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); + + for (auto& data : uploads) { + const GPUVAddr offset = data.exec_address - launch_desc_loc; + if (offset / sizeof(u32) == LAUNCH_REG_INDEX(grid_dim_x) && + memory_manager.IsMemoryDirty(data.upload_address, data.copy_size)) { + indirect_compute = {data.upload_address}; + } + } + uploads.clear(); ProcessLaunch(); + indirect_compute = std::nullopt; break; + } default: 
break; } @@ -62,6 +79,7 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun u32 methods_pending) { switch (method) { case KEPLER_COMPUTE_REG_INDEX(data_upload): + upload_address = current_dma_segment; upload_state.ProcessData(base_start, amount); return; default: diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 2092e685f..735e05fb4 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -5,6 +5,7 @@ #include <array> #include <cstddef> +#include <optional> #include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" @@ -36,6 +37,9 @@ namespace Tegra::Engines { #define KEPLER_COMPUTE_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) +#define LAUNCH_REG_INDEX(field_name) \ + (offsetof(Tegra::Engines::KeplerCompute::LaunchParams, field_name) / sizeof(u32)) + class KeplerCompute final : public EngineInterface { public: explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager); @@ -201,6 +205,10 @@ public: void CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) override; + std::optional<GPUVAddr> GetIndirectComputeAddress() const { + return indirect_compute; + } + private: void ProcessLaunch(); @@ -216,6 +224,15 @@ private: MemoryManager& memory_manager; VideoCore::RasterizerInterface* rasterizer = nullptr; Upload::State upload_state; + GPUVAddr upload_address; + + struct UploadInfo { + GPUVAddr upload_address; + GPUVAddr exec_address; + u32 copy_size; + }; + std::vector<UploadInfo> uploads; + std::optional<GPUVAddr> indirect_compute{}; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index c3696096d..32d767d85 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -20,8 +20,6 @@ namespace 
Tegra::Engines { -using VideoCore::QueryType; - /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; @@ -257,6 +255,7 @@ u32 Maxwell3D::GetMaxCurrentVertices() { const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); num_vertices = std::max( num_vertices, address_size / std::max(attribute.SizeInBytes(), array.stride.Value())); + break; } return num_vertices; } @@ -269,10 +268,13 @@ size_t Maxwell3D::EstimateIndexBufferSize() { std::numeric_limits<u32>::max()}; const size_t byte_size = regs.index_buffer.FormatSizeInBytes(); const size_t log2_byte_size = Common::Log2Ceil64(byte_size); + const size_t cap{GetMaxCurrentVertices() * 3 * byte_size}; + const size_t lower_cap = + std::min<size_t>(static_cast<size_t>(end_address - start_address), cap); return std::min<size_t>( memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) / byte_size, - static_cast<size_t>(end_address - start_address)); + lower_cap); } u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { @@ -496,27 +498,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { } void Maxwell3D::ProcessQueryGet() { + VideoCommon::QueryPropertiesFlags flags{}; + if (regs.report_semaphore.query.short_query == 0) { + flags |= VideoCommon::QueryPropertiesFlags::HasTimeout; + } + const GPUVAddr sequence_address{regs.report_semaphore.Address()}; + const VideoCommon::QueryType query_type = + static_cast<VideoCommon::QueryType>(regs.report_semaphore.query.report.Value()); + const u32 payload = regs.report_semaphore.payload; + const u32 subreport = regs.report_semaphore.query.sub_report; switch (regs.report_semaphore.query.operation) { case Regs::ReportSemaphore::Operation::Release: if (regs.report_semaphore.query.short_query != 0) { - const GPUVAddr sequence_address{regs.report_semaphore.Address()}; - const u32 payload = regs.report_semaphore.payload; - std::function<void()> operation([this, sequence_address, 
payload] { - memory_manager.Write<u32>(sequence_address, payload); - }); - rasterizer->SignalFence(std::move(operation)); - } else { - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - const GPUVAddr sequence_address{regs.report_semaphore.Address()}; - const u32 payload = regs.report_semaphore.payload; - [this, sequence_address, payload] { - memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks()); - memory_manager.Write<u64>(sequence_address, payload); - }(); + flags |= VideoCommon::QueryPropertiesFlags::IsAFence; } + rasterizer->Query(sequence_address, query_type, flags, payload, subreport); break; case Regs::ReportSemaphore::Operation::Acquire: // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that @@ -524,11 +520,7 @@ void Maxwell3D::ProcessQueryGet() { UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); break; case Regs::ReportSemaphore::Operation::ReportOnly: - if (const std::optional<u64> result = GetQueryResult()) { - // If the query returns an empty optional it means it's cached and deferred. - // In this case we have a non-empty result, so we stamp it immediately. 
- StampQueryResult(*result, regs.report_semaphore.query.short_query == 0); - } + rasterizer->Query(sequence_address, query_type, flags, payload, subreport); break; case Regs::ReportSemaphore::Operation::Trap: UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); @@ -540,6 +532,10 @@ void Maxwell3D::ProcessQueryGet() { } void Maxwell3D::ProcessQueryCondition() { + if (rasterizer->AccelerateConditionalRendering()) { + execute_on = true; + return; + } const GPUVAddr condition_address{regs.render_enable.Address()}; switch (regs.render_enable_override) { case Regs::RenderEnable::Override::AlwaysRender: @@ -549,10 +545,6 @@ void Maxwell3D::ProcessQueryCondition() { execute_on = false; break; case Regs::RenderEnable::Override::UseRenderEnable: { - if (rasterizer->AccelerateConditionalRendering()) { - execute_on = true; - return; - } switch (regs.render_enable.mode) { case Regs::RenderEnable::Mode::True: { execute_on = true; @@ -594,15 +586,9 @@ void Maxwell3D::ProcessQueryCondition() { } void Maxwell3D::ProcessCounterReset() { -#if ANDROID - if (!Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - return; - } -#endif switch (regs.clear_report_value) { case Regs::ClearReport::ZPassPixelCount: - rasterizer->ResetCounter(QueryType::SamplesPassed); + rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); break; default: LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); @@ -616,28 +602,6 @@ void Maxwell3D::ProcessSyncPoint() { rasterizer->SignalSyncPoint(sync_point); } -std::optional<u64> Maxwell3D::GetQueryResult() { - switch (regs.report_semaphore.query.report) { - case Regs::ReportSemaphore::Report::Payload: - return regs.report_semaphore.payload; - case Regs::ReportSemaphore::Report::ZPassPixelCount64: -#if ANDROID - if (!Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - return 120; - } -#endif - // Deferred. 
- rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed, - system.GPU().GetTicks()); - return std::nullopt; - default: - LOG_DEBUG(HW_GPU, "Unimplemented query report type {}", - regs.report_semaphore.query.report.Value()); - return 1; - } -} - void Maxwell3D::ProcessCBBind(size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader // stage. diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 6c19354e1..17faacc37 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -3182,9 +3182,6 @@ private: /// Handles writes to syncing register. void ProcessSyncPoint(); - /// Returns a query's value or an empty object if the value will be deferred through a cache. - std::optional<u64> GetQueryResult(); - void RefreshParametersImpl(); bool IsMethodExecutable(u32 method); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index cd8e24b0b..422d4d859 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -5,6 +5,7 @@ #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" +#include "common/polyfill_ranges.h" #include "common/settings.h" #include "core/core.h" #include "core/memory.h" @@ -108,10 +109,11 @@ void MaxwellDMA::Launch() { const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) { ASSERT(regs.remap_const.component_size_minus_one == 3); - accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); + accelerate.BufferClear(regs.offset_out, regs.line_length_in, + regs.remap_const.remap_consta_value); read_buffer.resize_destructive(regs.line_length_in * sizeof(u32)); std::span<u32> span(reinterpret_cast<u32*>(read_buffer.data()), regs.line_length_in); - std::ranges::fill(span, 
regs.remap_consta_value); + std::ranges::fill(span, regs.remap_const.remap_consta_value); memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(read_buffer.data()), regs.line_length_in * sizeof(u32)); @@ -360,21 +362,17 @@ void MaxwellDMA::ReleaseSemaphore() { const auto type = regs.launch_dma.semaphore_type; const GPUVAddr address = regs.semaphore.address; const u32 payload = regs.semaphore.payload; + VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence}; switch (type) { case LaunchDMA::SemaphoreType::NONE: break; case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { - std::function<void()> operation( - [this, address, payload] { memory_manager.Write<u32>(address, payload); }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0); break; } case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { - std::function<void()> operation([this, address, payload] { - memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks()); - memory_manager.Write<u64>(address, payload); - }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(address, VideoCommon::QueryType::Payload, + flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); break; } default: diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 69e26cb32..1a43e24b6 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -214,14 +214,15 @@ public: NO_WRITE = 6, }; - PackedGPUVAddr address; + u32 remap_consta_value; + u32 remap_constb_value; union { + BitField<0, 12, u32> dst_components_raw; BitField<0, 3, Swizzle> dst_x; BitField<4, 3, Swizzle> dst_y; BitField<8, 3, Swizzle> dst_z; BitField<12, 3, Swizzle> dst_w; - BitField<0, 12, u32> dst_components_raw; BitField<16, 2, u32> component_size_minus_one; BitField<20, 2, u32> num_src_components_minus_one; BitField<24, 
2, u32> num_dst_components_minus_one; @@ -274,55 +275,57 @@ private: struct Regs { union { struct { - u32 reserved[0x40]; + INSERT_PADDING_BYTES_NOINIT(0x100); u32 nop; - u32 reserved01[0xf]; + INSERT_PADDING_BYTES_NOINIT(0x3C); u32 pm_trigger; - u32 reserved02[0x3f]; + INSERT_PADDING_BYTES_NOINIT(0xFC); Semaphore semaphore; - u32 reserved03[0x2]; + INSERT_PADDING_BYTES_NOINIT(0x8); RenderEnable render_enable; PhysMode src_phys_mode; PhysMode dst_phys_mode; - u32 reserved04[0x26]; + INSERT_PADDING_BYTES_NOINIT(0x98); LaunchDMA launch_dma; - u32 reserved05[0x3f]; + INSERT_PADDING_BYTES_NOINIT(0xFC); PackedGPUVAddr offset_in; PackedGPUVAddr offset_out; s32 pitch_in; s32 pitch_out; u32 line_length_in; u32 line_count; - u32 reserved06[0xb6]; - u32 remap_consta_value; - u32 remap_constb_value; + INSERT_PADDING_BYTES_NOINIT(0x2E0); RemapConst remap_const; DMA::Parameters dst_params; - u32 reserved07[0x1]; + INSERT_PADDING_BYTES_NOINIT(0x4); DMA::Parameters src_params; - u32 reserved08[0x275]; + INSERT_PADDING_BYTES_NOINIT(0x9D4); u32 pm_trigger_end; - u32 reserved09[0x3ba]; + INSERT_PADDING_BYTES_NOINIT(0xEE8); }; std::array<u32, NUM_REGS> reg_array; }; } regs{}; + static_assert(sizeof(Regs) == NUM_REGS * 4); #define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4, \ + static_assert(offsetof(MaxwellDMA::Regs, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_REG_POSITION(launch_dma, 0xC0); - ASSERT_REG_POSITION(offset_in, 0x100); - ASSERT_REG_POSITION(offset_out, 0x102); - ASSERT_REG_POSITION(pitch_in, 0x104); - ASSERT_REG_POSITION(pitch_out, 0x105); - ASSERT_REG_POSITION(line_length_in, 0x106); - ASSERT_REG_POSITION(line_count, 0x107); - ASSERT_REG_POSITION(remap_const, 0x1C0); - ASSERT_REG_POSITION(dst_params, 0x1C3); - ASSERT_REG_POSITION(src_params, 0x1CA); - + ASSERT_REG_POSITION(semaphore, 0x240); + ASSERT_REG_POSITION(render_enable, 0x254); + 
ASSERT_REG_POSITION(src_phys_mode, 0x260); + ASSERT_REG_POSITION(launch_dma, 0x300); + ASSERT_REG_POSITION(offset_in, 0x400); + ASSERT_REG_POSITION(offset_out, 0x408); + ASSERT_REG_POSITION(pitch_in, 0x410); + ASSERT_REG_POSITION(pitch_out, 0x414); + ASSERT_REG_POSITION(line_length_in, 0x418); + ASSERT_REG_POSITION(line_count, 0x41C); + ASSERT_REG_POSITION(remap_const, 0x700); + ASSERT_REG_POSITION(dst_params, 0x70C); + ASSERT_REG_POSITION(src_params, 0x728); + ASSERT_REG_POSITION(pm_trigger_end, 0x1114); #undef ASSERT_REG_POSITION }; diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 7718a09b3..8dd34c04a 100644 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp @@ -34,19 +34,24 @@ void Puller::ProcessBindMethod(const MethodCall& method_call) { bound_engines[method_call.subchannel] = engine_id; switch (engine_id) { case EngineID::FERMI_TWOD_A: - dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel); + dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel, + EngineTypes::Fermi2D); break; case EngineID::MAXWELL_B: - dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel); + dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel, + EngineTypes::Maxwell3D); break; case EngineID::KEPLER_COMPUTE_B: - dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel); + dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel, + EngineTypes::KeplerCompute); break; case EngineID::MAXWELL_DMA_COPY_A: - dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel); + dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel, + EngineTypes::MaxwellDMA); break; case EngineID::KEPLER_INLINE_TO_MEMORY_B: - dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel); + 
dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel, + EngineTypes::KeplerMemory); break; default: UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); @@ -77,10 +82,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { if (op == GpuSemaphoreOperation::WriteLong) { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_sequence; - [this, sequence_address, payload] { - memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks()); - memory_manager.Write<u64>(sequence_address, payload); - }(); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, + VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); } else { do { const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())}; @@ -115,10 +118,8 @@ void Puller::ProcessSemaphoreTriggerMethod() { void Puller::ProcessSemaphoreRelease() { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_release; - std::function<void()> operation([this, sequence_address, payload] { - memory_manager.Write<u32>(sequence_address, payload); - }); - rasterizer->SignalFence(std::move(operation)); + rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, + VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); } void Puller::ProcessSemaphoreAcquire() { @@ -127,7 +128,6 @@ void Puller::ProcessSemaphoreAcquire() { while (word != value) { regs.acquire_active = true; regs.acquire_value = value; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); rasterizer->ReleaseFences(); word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); // TODO(kemathe73) figure out how to do the acquire_timeout diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index ab20ff30f..805a89900 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -55,6 +55,9 @@ public: // 
Unlike other fences, this one doesn't void SignalOrdering() { + if constexpr (!can_async_check) { + TryReleasePendingFences<false>(); + } std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.AccumulateFlushes(); } @@ -104,9 +107,25 @@ public: SignalFence(std::move(func)); } - void WaitPendingFences() { + void WaitPendingFences([[maybe_unused]] bool force) { if constexpr (!can_async_check) { TryReleasePendingFences<true>(); + } else { + if (!force) { + return; + } + std::mutex wait_mutex; + std::condition_variable wait_cv; + std::atomic<bool> wait_finished{}; + std::function<void()> func([&] { + std::scoped_lock lk(wait_mutex); + wait_finished.store(true, std::memory_order_relaxed); + wait_cv.notify_all(); + }); + SignalFence(std::move(func)); + std::unique_lock lk(wait_mutex); + wait_cv.wait( + lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); } } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index c192e33b2..11549d448 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -102,7 +102,8 @@ struct GPU::Impl { /// Signal the ending of command list. void OnCommandListEnd() { - rasterizer->ReleaseFences(); + rasterizer->ReleaseFences(false); + Settings::UpdateGPUAccuracy(); } /// Request a host GPU memory flush from the CPU. @@ -220,6 +221,7 @@ struct GPU::Impl { /// This can be used to launch any necessary threads and register any necessary /// core timing events. 
void Start() { + Settings::UpdateGPUAccuracy(); gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); } diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp index da07a556f..8d7da50fc 100644 --- a/src/video_core/host1x/codecs/codec.cpp +++ b/src/video_core/host1x/codecs/codec.cpp @@ -247,7 +247,7 @@ void Codec::Initialize() { av_codec = avcodec_find_decoder(codec); InitializeAvCodecContext(); - if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::GPU) { + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu) { InitializeGpuDecoder(); } if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) { @@ -319,6 +319,7 @@ void Codec::Decode() { LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); return; } + bool is_interlaced = initial_frame->interlaced_frame != 0; if (av_codec_ctx->hw_device_ctx) { final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed"); @@ -334,7 +335,7 @@ void Codec::Decode() { UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format); return; } - if (!final_frame->interlaced_frame) { + if (!is_interlaced) { av_frames.push(std::move(final_frame)); } else { if (!filters_initialized) { diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 862904e39..ece79b1e2 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -84,7 +84,7 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters // TODO (ameerj): Where do we get this number, it seems to be particular for each stream const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); - const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU; + const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu; const u32 max_num_ref_frames = 
uses_gpu_decoding ? 6u : 16u; writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index e61d9af80..8bb429578 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -19,6 +19,7 @@ set(SHADER_FILES block_linear_unswizzle_2d.comp block_linear_unswizzle_3d.comp convert_abgr8_to_d24s8.frag + convert_d32f_to_abgr8.frag convert_d24s8_to_abgr8.frag convert_depth_to_float.frag convert_float_to_depth.frag @@ -41,6 +42,9 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag + queries_prefix_scan_sum.comp + queries_prefix_scan_sum_nosubgroups.comp + resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag smaa_blending_weight_calculation.vert @@ -50,6 +54,7 @@ set(SHADER_FILES vulkan_blit_depth_stencil.frag vulkan_color_clear.frag vulkan_color_clear.vert + vulkan_depthstencil_clear.frag vulkan_fidelityfx_fsr_easu_fp16.comp vulkan_fidelityfx_fsr_easu_fp32.comp vulkan_fidelityfx_fsr_rcas_fp16.comp @@ -69,6 +74,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") endif() set(GLSL_FLAGS "") +set(SPIR_V_VERSION "spirv1.3") set(QUIET_FLAG "--quiet") set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) @@ -122,7 +128,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) OUTPUT ${SPIRV_HEADER_FILE} COMMAND - ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} MAIN_DEPENDENCY ${SOURCE_FILE} ) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index bf2693559..5ff17cd0c 100644 --- 
a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -33,26 +33,14 @@ UNIFORM(6) uint block_height_mask; END_PUSH_CONSTANTS struct EncodingData { - uint encoding; - uint num_bits; - uint bit_value; - uint quint_trit_value; + uint data; }; -struct TexelWeightParams { - uvec2 size; - uint max_weight; - bool dual_plane; - bool error_state; - bool void_extent_ldr; - bool void_extent_hdr; -}; - -layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { +layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 { uvec4 astc_data[]; }; -layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; +layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image; const uint GOB_SIZE_X_SHIFT = 6; const uint GOB_SIZE_Y_SHIFT = 3; @@ -60,64 +48,21 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; const uint BYTES_PER_BLOCK_LOG2 = 4; -const int JUST_BITS = 0; -const int QUINT = 1; -const int TRIT = 2; +const uint JUST_BITS = 0u; +const uint QUINT = 1u; +const uint TRIT = 2u; // ASTC Encodings data, sorted in ascending order based on their BitLength value // (see GetBitLength() function) -EncodingData encoding_values[22] = EncodingData[]( - EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), - EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), - EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), - EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), - EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), - EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), - EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0), - 
EncodingData(JUST_BITS, 8, 0, 0) -); - -// The following constants are expanded variants of the Replicate() -// function calls corresponding to the following arguments: -// value: index into the generated table -// num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4. -// to_bit: the integer after "TO_" -const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127); -const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); - -const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); -const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255); -const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255); -const uint REPLICATE_4_BIT_TO_8_TABLE[16] = - uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255); -const uint REPLICATE_5_BIT_TO_8_TABLE[32] = - uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165, - 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255); -const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); -const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63); -const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63); -const uint REPLICATE_4_BIT_TO_6_TABLE[16] = - uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63); -const uint REPLICATE_5_BIT_TO_6_TABLE[32] = - uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, - 47, 49, 51, 53, 55, 57, 59, 61, 63); -const uint REPLICATE_6_BIT_TO_8_TABLE[64] = - uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, - 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, - 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, - 239, 243, 247, 251, 255); -const uint REPLICATE_7_BIT_TO_8_TABLE[128] = - uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 
44, - 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, - 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, - 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, - 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, - 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, - 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); +const uint encoding_values[22] = uint[]( + (JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u)), + (QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u)), + (TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u)), + (JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)), + (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), + (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); // Input ASTC texture globals -uint current_index = 0; -int bitsread = 0; int total_bitsread = 0; uvec4 local_buff; @@ -125,50 +70,60 @@ uvec4 local_buff; uvec4 color_endpoint_data; int color_bitsread = 0; -// Four values, two endpoints, four maximum partitions -uint color_values[32]; -int colvals_index = 0; - -// Weight data globals -uvec4 texel_weight_data; -int texel_bitsread = 0; +// Global "vector" to be pushed into when decoding +// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode +// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode +// So the maximum would be 144 (12 x 12) elements, x 2 for two planes +#define DIVCEIL(number, divisor) (number + divisor - 1) / divisor +#define ARRAY_NUM_ELEMENTS 144 +#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4) +uint result_vector[ARRAY_NUM_ELEMENTS * 2]; -bool texel_flag = false; - -// Global "vectors" to be pushed into when decoding 
-EncodingData result_vector[144]; int result_index = 0; +uint result_vector_max_index; +bool result_limit_reached = false; -EncodingData texel_vector[144]; -int texel_vector_index = 0; +// EncodingData helpers +uint Encoding(EncodingData val) { + return bitfieldExtract(val.data, 0, 8); +} +uint NumBits(EncodingData val) { + return bitfieldExtract(val.data, 8, 8); +} +uint BitValue(EncodingData val) { + return bitfieldExtract(val.data, 16, 8); +} +uint QuintTritValue(EncodingData val) { + return bitfieldExtract(val.data, 24, 8); +} -uint unquantized_texel_weights[2][144]; +void Encoding(inout EncodingData val, uint v) { + val.data = bitfieldInsert(val.data, v, 0, 8); +} +void NumBits(inout EncodingData val, uint v) { + val.data = bitfieldInsert(val.data, v, 8, 8); +} +void BitValue(inout EncodingData val, uint v) { + val.data = bitfieldInsert(val.data, v, 16, 8); +} +void QuintTritValue(inout EncodingData val, uint v) { + val.data = bitfieldInsert(val.data, v, 24, 8); +} -uint SwizzleOffset(uvec2 pos) { - uint x = pos.x; - uint y = pos.y; - return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + - (y % 2) * 16 + (x % 16); +EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) { + return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | + ((bit_val) << 16u) | ((quint_trit_val) << 24u)); } -// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] -// is the same as [(num_bits - 1):0] and repeats all the way down. 
-uint Replicate(uint val, uint num_bits, uint to_bit) { - const uint v = val & uint((1 << num_bits) - 1); - uint res = v; - uint reslen = num_bits; - while (reslen < to_bit) { - uint comp = 0; - if (num_bits > to_bit - reslen) { - uint newshift = to_bit - reslen; - comp = num_bits - newshift; - num_bits = newshift; - } - res = uint(res << num_bits); - res = uint(res | (v >> comp)); - reslen += num_bits; + +void ResultEmplaceBack(EncodingData val) { + if (result_index >= result_vector_max_index) { + // Alert callers to avoid decoding more than needed by this phase + result_limit_reached = true; + return; } - return res; + result_vector[result_index] = val.data; + ++result_index; } uvec4 ReplicateByteTo16(uvec4 value) { @@ -176,64 +131,40 @@ uvec4 ReplicateByteTo16(uvec4 value) { } uint ReplicateBitTo7(uint value) { - return REPLICATE_BIT_TO_7_TABLE[value]; + return value * 127; } uint ReplicateBitTo9(uint value) { - return REPLICATE_1_BIT_TO_9_TABLE[value]; + return value * 511; } -uint FastReplicate(uint value, uint num_bits, uint to_bit) { - if (num_bits == 0) { +uint ReplicateBits(uint value, uint num_bits, uint to_bit) { + if (value == 0 || num_bits == 0) { return 0; } - if (num_bits == to_bit) { + if (num_bits >= to_bit) { return value; } - if (to_bit == 6) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_6_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_6_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_6_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_6_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_6_TABLE[value]; - default: - break; - } - } else { /* if (to_bit == 8) */ - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_8_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_8_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_8_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_8_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_8_TABLE[value]; - case 6: - return REPLICATE_6_BIT_TO_8_TABLE[value]; - case 7: - return 
REPLICATE_7_BIT_TO_8_TABLE[value]; - default: - break; - } + const uint v = value & uint((1 << num_bits) - 1); + uint res = v; + uint reslen = num_bits; + while (reslen < to_bit) { + const uint num_dst_bits_to_shift_up = min(num_bits, to_bit - reslen); + const uint num_src_bits_to_shift_down = num_bits - num_dst_bits_to_shift_up; + + res <<= num_dst_bits_to_shift_up; + res |= (v >> num_src_bits_to_shift_down); + reslen += num_bits; } - return Replicate(value, num_bits, to_bit); + return res; } uint FastReplicateTo8(uint value, uint num_bits) { - return FastReplicate(value, num_bits, 8); + return ReplicateBits(value, num_bits, 8); } uint FastReplicateTo6(uint value, uint num_bits) { - return FastReplicate(value, num_bits, 6); + return ReplicateBits(value, num_bits, 6); } uint Div3Floor(uint v) { @@ -266,15 +197,15 @@ uint Hash52(uint p) { return p; } -uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { - if (small_block) { +uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { + if ((block_dims.y * block_dims.x) < 32) { x <<= 1; y <<= 1; } seed += (partition_count - 1) * 1024; - uint rnum = Hash52(uint(seed)); + const uint rnum = Hash52(uint(seed)); uint seed1 = uint(rnum & 0xF); uint seed2 = uint((rnum >> 4) & 0xF); uint seed3 = uint((rnum >> 8) & 0xF); @@ -342,53 +273,52 @@ uint ExtractBits(uvec4 payload, int offset, int bits) { if (bits <= 0) { return 0; } - int last_offset = offset + bits - 1; - int shifted_offset = offset >> 5; + if (bits > 32) { + return 0; + } + const int last_offset = offset + bits - 1; + const int shifted_offset = offset >> 5; if ((last_offset >> 5) == shifted_offset) { return bitfieldExtract(payload[shifted_offset], offset & 31, bits); } - int first_bits = 32 - (offset & 31); - int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); - int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); + const int 
first_bits = 32 - (offset & 31); + const int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); + const int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); return result_first | (result_second << first_bits); } uint StreamBits(uint num_bits) { - int int_bits = int(num_bits); - uint ret = ExtractBits(local_buff, total_bitsread, int_bits); + const int int_bits = int(num_bits); + const uint ret = ExtractBits(local_buff, total_bitsread, int_bits); total_bitsread += int_bits; return ret; } +void SkipBits(uint num_bits) { + const int int_bits = int(num_bits); + total_bitsread += int_bits; +} + uint StreamColorBits(uint num_bits) { - uint ret = 0; - int int_bits = int(num_bits); - if (texel_flag) { - ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits); - texel_bitsread += int_bits; - } else { - ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); - color_bitsread += int_bits; - } + const int int_bits = int(num_bits); + const uint ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); + color_bitsread += int_bits; return ret; } -void ResultEmplaceBack(EncodingData val) { - if (texel_flag) { - texel_vector[texel_vector_index] = val; - ++texel_vector_index; - } else { - result_vector[result_index] = val; - ++result_index; - } +EncodingData GetEncodingFromVector(uint index) { + const uint data = result_vector[index]; + return EncodingData(data); } // Returns the number of bits required to encode n_vals values. 
uint GetBitLength(uint n_vals, uint encoding_index) { - uint total_bits = encoding_values[encoding_index].num_bits * n_vals; - if (encoding_values[encoding_index].encoding == TRIT) { + const EncodingData encoding_value = EncodingData(encoding_values[encoding_index]); + const uint encoding = Encoding(encoding_value); + uint total_bits = NumBits(encoding_value) * n_vals; + if (encoding == TRIT) { total_bits += Div5Ceil(n_vals * 8); - } else if (encoding_values[encoding_index].encoding == QUINT) { + } else if (encoding == QUINT) { total_bits += Div3Ceil(n_vals * 7); } return total_bits; @@ -403,7 +333,7 @@ uint GetNumWeightValues(uvec2 size, bool dual_plane) { } uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) { - uint n_vals = GetNumWeightValues(size, dual_plane); + const uint n_vals = GetNumWeightValues(size, dual_plane); return GetBitLength(n_vals, max_weight); } @@ -412,87 +342,74 @@ uint BitsBracket(uint bits, uint pos) { } uint BitsOp(uint bits, uint start, uint end) { - if (start == end) { - return BitsBracket(bits, start); - } else if (start > end) { - uint t = start; - start = end; - end = t; - } - - uint mask = (1 << (end - start + 1)) - 1; + const uint mask = (1 << (end - start + 1)) - 1; return ((bits >> start) & mask); } void DecodeQuintBlock(uint num_bits) { - uint m[3]; - uint q[3]; - uint Q; + uvec3 m; + uvec4 qQ; m[0] = StreamColorBits(num_bits); - Q = StreamColorBits(3); + qQ.w = StreamColorBits(3); m[1] = StreamColorBits(num_bits); - Q |= StreamColorBits(2) << 3; + qQ.w |= StreamColorBits(2) << 3; m[2] = StreamColorBits(num_bits); - Q |= StreamColorBits(2) << 5; - if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { - q[0] = 4; - q[1] = 4; - q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) | - (BitsBracket(Q, 3) & ~BitsBracket(Q, 0)); + qQ.w |= StreamColorBits(2) << 5; + if (BitsOp(qQ.w, 1, 2) == 3 && BitsOp(qQ.w, 5, 6) == 0) { + qQ.x = 4; + qQ.y = 4; + qQ.z = (BitsBracket(qQ.w, 0) << 2) | 
((BitsBracket(qQ.w, 4) & ~BitsBracket(qQ.w, 0)) << 1) | + (BitsBracket(qQ.w, 3) & ~BitsBracket(qQ.w, 0)); } else { uint C = 0; - if (BitsOp(Q, 1, 2) == 3) { - q[2] = 4; - C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0); + if (BitsOp(qQ.w, 1, 2) == 3) { + qQ.z = 4; + C = (BitsOp(qQ.w, 3, 4) << 3) | ((~BitsOp(qQ.w, 5, 6) & 3) << 1) | BitsBracket(qQ.w, 0); } else { - q[2] = BitsOp(Q, 5, 6); - C = BitsOp(Q, 0, 4); + qQ.z = BitsOp(qQ.w, 5, 6); + C = BitsOp(qQ.w, 0, 4); } if (BitsOp(C, 0, 2) == 5) { - q[1] = 4; - q[0] = BitsOp(C, 3, 4); + qQ.y = 4; + qQ.x = BitsOp(C, 3, 4); } else { - q[1] = BitsOp(C, 3, 4); - q[0] = BitsOp(C, 0, 2); + qQ.y = BitsOp(C, 3, 4); + qQ.x = BitsOp(C, 0, 2); } } for (uint i = 0; i < 3; i++) { - EncodingData val; - val.encoding = QUINT; - val.num_bits = num_bits; - val.bit_value = m[i]; - val.quint_trit_value = q[i]; + const EncodingData val = CreateEncodingData(QUINT, num_bits, m[i], qQ[i]); ResultEmplaceBack(val); } } void DecodeTritBlock(uint num_bits) { - uint m[5]; - uint t[5]; - uint T; + uvec4 m; + uvec4 t; + uvec3 Tm5t5; m[0] = StreamColorBits(num_bits); - T = StreamColorBits(2); + Tm5t5.x = StreamColorBits(2); m[1] = StreamColorBits(num_bits); - T |= StreamColorBits(2) << 2; + Tm5t5.x |= StreamColorBits(2) << 2; m[2] = StreamColorBits(num_bits); - T |= StreamColorBits(1) << 4; + Tm5t5.x |= StreamColorBits(1) << 4; m[3] = StreamColorBits(num_bits); - T |= StreamColorBits(2) << 5; - m[4] = StreamColorBits(num_bits); - T |= StreamColorBits(1) << 7; + Tm5t5.x |= StreamColorBits(2) << 5; + Tm5t5.y = StreamColorBits(num_bits); + Tm5t5.x |= StreamColorBits(1) << 7; uint C = 0; - if (BitsOp(T, 2, 4) == 7) { - C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1); - t[4] = 2; + if (BitsOp(Tm5t5.x, 2, 4) == 7) { + C = (BitsOp(Tm5t5.x, 5, 7) << 2) | BitsOp(Tm5t5.x, 0, 1); + Tm5t5.z = 2; t[3] = 2; } else { - C = BitsOp(T, 0, 4); - if (BitsOp(T, 5, 6) == 3) { - t[4] = 2; - t[3] = BitsBracket(T, 7); + C = BitsOp(Tm5t5.x, 0, 4); 
+ if (BitsOp(Tm5t5.x, 5, 6) == 3) { + Tm5t5.z = 2; + t[3] = BitsBracket(Tm5t5.x, 7); } else { - t[4] = BitsBracket(T, 7); - t[3] = BitsOp(T, 5, 6); + Tm5t5.z = BitsBracket(Tm5t5.x, 7); + t[3] = BitsOp(Tm5t5.x, 5, 6); } } if (BitsOp(C, 0, 1) == 3) { @@ -508,31 +425,31 @@ void DecodeTritBlock(uint num_bits) { t[1] = BitsOp(C, 2, 3); t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1)); } - for (uint i = 0; i < 5; i++) { - EncodingData val; - val.encoding = TRIT; - val.num_bits = num_bits; - val.bit_value = m[i]; - val.quint_trit_value = t[i]; + for (uint i = 0; i < 4; i++) { + const EncodingData val = CreateEncodingData(TRIT, num_bits, m[i], t[i]); ResultEmplaceBack(val); } + const EncodingData val = CreateEncodingData(TRIT, num_bits, Tm5t5.y, Tm5t5.z); + ResultEmplaceBack(val); } void DecodeIntegerSequence(uint max_range, uint num_values) { - EncodingData val = encoding_values[max_range]; + EncodingData val = EncodingData(encoding_values[max_range]); + const uint encoding = Encoding(val); + const uint num_bits = NumBits(val); uint vals_decoded = 0; - while (vals_decoded < num_values) { - switch (val.encoding) { + while (vals_decoded < num_values && !result_limit_reached) { + switch (encoding) { case QUINT: - DecodeQuintBlock(val.num_bits); + DecodeQuintBlock(num_bits); vals_decoded += 3; break; case TRIT: - DecodeTritBlock(val.num_bits); + DecodeTritBlock(num_bits); vals_decoded += 5; break; case JUST_BITS: - val.bit_value = StreamColorBits(val.num_bits); + BitValue(val, StreamColorBits(num_bits)); ResultEmplaceBack(val); vals_decoded++; break; @@ -540,7 +457,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { } } -void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { +void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) { uint num_values = 0; for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; @@ -549,7 +466,7 @@ 
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { // TODO(ameerj): profile with binary search int range = 0; while (++range < encoding_values.length()) { - uint bit_length = GetBitLength(num_values, range); + const uint bit_length = GetBitLength(num_values, range); if (bit_length > color_data_bits) { break; } @@ -560,48 +477,49 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { if (out_index >= num_values) { break; } - EncodingData val = result_vector[itr]; - uint bitlen = val.num_bits; - uint bitval = val.bit_value; + const EncodingData val = GetEncodingFromVector(itr); + const uint encoding = Encoding(val); + const uint bitlen = NumBits(val); + const uint bitval = BitValue(val); uint A = 0, B = 0, C = 0, D = 0; A = ReplicateBitTo9((bitval & 1)); - switch (val.encoding) { + switch (encoding) { case JUST_BITS: - color_values[out_index++] = FastReplicateTo8(bitval, bitlen); + color_values[++out_index] = FastReplicateTo8(bitval, bitlen); break; case TRIT: { - D = val.quint_trit_value; + D = QuintTritValue(val); switch (bitlen) { case 1: C = 204; break; case 2: { C = 93; - uint b = (bitval >> 1) & 1; + const uint b = (bitval >> 1) & 1; B = (b << 8) | (b << 4) | (b << 2) | (b << 1); break; } case 3: { C = 44; - uint cb = (bitval >> 1) & 3; + const uint cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 2) | cb; break; } case 4: { C = 22; - uint dcb = (bitval >> 1) & 7; + const uint dcb = (bitval >> 1) & 7; B = (dcb << 6) | dcb; break; } case 5: { C = 11; - uint edcb = (bitval >> 1) & 0xF; + const uint edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 2); break; } case 6: { C = 5; - uint fedcb = (bitval >> 1) & 0x1F; + const uint fedcb = (bitval >> 1) & 0x1F; B = (fedcb << 4) | (fedcb >> 4); break; } @@ -609,32 +527,32 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { break; } case QUINT: { - D = val.quint_trit_value; + D = QuintTritValue(val); switch (bitlen) { case 1: C 
= 113; break; case 2: { C = 54; - uint b = (bitval >> 1) & 1; + const uint b = (bitval >> 1) & 1; B = (b << 8) | (b << 3) | (b << 2); break; } case 3: { C = 26; - uint cb = (bitval >> 1) & 3; + const uint cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 1) | (cb >> 1); break; } case 4: { C = 13; - uint dcb = (bitval >> 1) & 7; + const uint dcb = (bitval >> 1) & 7; B = (dcb << 6) | (dcb >> 1); break; } case 5: { C = 6; - uint edcb = (bitval >> 1) & 0xF; + const uint edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 3); break; } @@ -642,11 +560,11 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { break; } } - if (val.encoding != JUST_BITS) { + if (encoding != JUST_BITS) { uint T = (D * C) + B; T ^= A; T = (A & 0x80) | (T >> 2); - color_values[out_index++] = T; + color_values[++out_index] = T; } } } @@ -664,139 +582,136 @@ ivec2 BitTransferSigned(int a, int b) { } uvec4 ClampByte(ivec4 color) { - for (uint i = 0; i < 4; ++i) { - color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 
255 : color[i]); - } - return uvec4(color); + return uvec4(clamp(color, 0, 255)); } ivec4 BlueContract(int a, int r, int g, int b) { return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); } -void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { +void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, uint color_values[32], + inout uint colvals_index) { #define READ_UINT_VALUES(N) \ - uint v[N]; \ + uvec4 V[2]; \ for (uint i = 0; i < N; i++) { \ - v[i] = color_values[colvals_index++]; \ + V[i / 4][i % 4] = color_values[++colvals_index]; \ } - #define READ_INT_VALUES(N) \ - int v[N]; \ + ivec4 V[2]; \ for (uint i = 0; i < N; i++) { \ - v[i] = int(color_values[colvals_index++]); \ + V[i / 4][i % 4] = int(color_values[++colvals_index]); \ } switch (color_endpoint_mode) { case 0: { READ_UINT_VALUES(2) - ep1 = uvec4(0xFF, v[0], v[0], v[0]); - ep2 = uvec4(0xFF, v[1], v[1], v[1]); + ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x); + ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y); break; } case 1: { READ_UINT_VALUES(2) - uint L0 = (v[0] >> 2) | (v[1] & 0xC0); - uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); + const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0); + const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU); ep1 = uvec4(0xFF, L0, L0, L0); ep2 = uvec4(0xFF, L1, L1, L1); break; } case 4: { READ_UINT_VALUES(4) - ep1 = uvec4(v[2], v[0], v[0], v[0]); - ep2 = uvec4(v[3], v[1], v[1], v[1]); + ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x); + ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y); break; } case 5: { READ_INT_VALUES(4) - ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred.x; - v[0] = transferred.y; - transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred.x; - v[2] = transferred.y; - ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0])); - ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1])); + ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); + V[0].y = transferred.x; + V[0].x = transferred.y; + 
transferred = BitTransferSigned(V[0].w, V[0].z); + V[0].w = transferred.x; + V[0].z = transferred.y; + ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x)); + ep2 = ClampByte(ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y)); break; } case 6: { READ_UINT_VALUES(4) - ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); - ep2 = uvec4(0xFF, v[0], v[1], v[2]); + ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8); + ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z); break; } case 8: { READ_UINT_VALUES(6) - if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { - ep1 = uvec4(0xFF, v[0], v[2], v[4]); - ep2 = uvec4(0xFF, v[1], v[3], v[5]); + if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) { + ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x); + ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y); } else { - ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5]))); - ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4]))); + ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y))); + ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x))); } break; } case 9: { READ_INT_VALUES(6) - ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred.x; - v[0] = transferred.y; - transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred.x; - v[2] = transferred.y; - transferred = BitTransferSigned(v[5], v[4]); - v[5] = transferred.x; - v[4] = transferred.y; - if ((v[1] + v[3] + v[5]) >= 0) { - ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4])); - ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); + ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); + V[0].y = transferred.x; + V[0].x = transferred.y; + transferred = BitTransferSigned(V[0].w, V[0].z); + V[0].w = transferred.x; + V[0].z = transferred.y; + transferred = BitTransferSigned(V[1].y, V[1].x); + V[1].y = transferred.x; + V[1].x = transferred.y; + if ((V[0].y + V[0].w + 
V[1].y) >= 0) { + ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x)); + ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); } else { - ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5])); - ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4])); + ep1 = ClampByte(BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); + ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x)); } break; } case 10: { READ_UINT_VALUES(6) - ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8); - ep2 = uvec4(v[5], v[0], v[1], v[2]); + ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8); + ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z); break; } case 12: { READ_UINT_VALUES(8) - if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) { - ep1 = uvec4(v[6], v[0], v[2], v[4]); - ep2 = uvec4(v[7], v[1], v[3], v[5]); + if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) { + ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x); + ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y); } else { - ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5]))); - ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4]))); + ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y))); + ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x))); } break; } case 13: { READ_INT_VALUES(8) - ivec2 transferred = BitTransferSigned(v[1], v[0]); - v[1] = transferred.x; - v[0] = transferred.y; - transferred = BitTransferSigned(v[3], v[2]); - v[3] = transferred.x; - v[2] = transferred.y; - - transferred = BitTransferSigned(v[5], v[4]); - v[5] = transferred.x; - v[4] = transferred.y; - - transferred = BitTransferSigned(v[7], v[6]); - v[7] = transferred.x; - v[6] = transferred.y; - - if ((v[1] + v[3] + v[5]) >= 0) { - ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4])); - ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + 
v[5])); + ivec2 transferred = BitTransferSigned(V[0].y, V[0].x); + V[0].y = transferred.x; + V[0].x = transferred.y; + transferred = BitTransferSigned(V[0].w, V[0].z); + V[0].w = transferred.x; + V[0].z = transferred.y; + + transferred = BitTransferSigned(V[1].y, V[1].x); + V[1].y = transferred.x; + V[1].x = transferred.y; + + transferred = BitTransferSigned(V[1].w, V[1].z); + V[1].w = transferred.x; + V[1].z = transferred.y; + + if ((V[0].y + V[0].w + V[1].y) >= 0) { + ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x)); + ep2 = ClampByte(ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); } else { - ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5])); - ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4])); + ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y)); + ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x)); } break; } @@ -812,36 +727,34 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { } uint UnquantizeTexelWeight(EncodingData val) { - uint bitval = val.bit_value; - uint bitlen = val.num_bits; - uint A = ReplicateBitTo7((bitval & 1)); + const uint encoding = Encoding(val); + const uint bitlen = NumBits(val); + const uint bitval = BitValue(val); + const uint A = ReplicateBitTo7((bitval & 1)); uint B = 0, C = 0, D = 0; uint result = 0; - switch (val.encoding) { + const uint bitlen_0_results[5] = {0, 16, 32, 48, 64}; + switch (encoding) { case JUST_BITS: - result = FastReplicateTo6(bitval, bitlen); - break; + return FastReplicateTo6(bitval, bitlen); case TRIT: { - D = val.quint_trit_value; + D = QuintTritValue(val); switch (bitlen) { - case 0: { - uint results[3] = {0, 32, 63}; - result = results[D]; - break; - } + case 0: + return bitlen_0_results[D * 2]; case 1: { C = 50; break; } case 2: { C = 23; - uint b = (bitval >> 1) & 1; + const uint b = (bitval >> 1) & 1; B = (b << 6) | (b << 2) | b; break; } case 3: { C 
= 11; - uint cb = (bitval >> 1) & 3; + const uint cb = (bitval >> 1) & 3; B = (cb << 5) | cb; break; } @@ -851,20 +764,17 @@ uint UnquantizeTexelWeight(EncodingData val) { break; } case QUINT: { - D = val.quint_trit_value; + D = QuintTritValue(val); switch (bitlen) { - case 0: { - uint results[5] = {0, 16, 32, 47, 63}; - result = results[D]; - break; - } + case 0: + return bitlen_0_results[D]; case 1: { C = 28; break; } case 2: { C = 13; - uint b = (bitval >> 1) & 1; + const uint b = (bitval >> 1) & 1; B = (b << 6) | (b << 1); break; } @@ -872,7 +782,7 @@ uint UnquantizeTexelWeight(EncodingData val) { break; } } - if (val.encoding != JUST_BITS && bitlen > 0) { + if (encoding != JUST_BITS && bitlen > 0) { result = D * C + B; result ^= A; result = (A & 0x20) | (result >> 2); @@ -883,61 +793,77 @@ uint UnquantizeTexelWeight(EncodingData val) { return result; } -void UnquantizeTexelWeights(bool dual_plane, uvec2 size) { - uint weight_idx = 0; - uint unquantized[2][144]; - uint area = size.x * size.y; - for (uint itr = 0; itr < texel_vector_index; itr++) { - unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); - if (dual_plane) { - ++itr; - unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]); - if (itr == texel_vector_index) { - break; - } - } - if (++weight_idx >= (area)) - break; +void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) { + const uint num_planes = is_dual_plane ? 2 : 1; + const uint area = size.x * size.y; + const uint loop_count = min(result_index, area * num_planes); + for (uint itr = 0; itr < loop_count; ++itr) { + result_vector[itr] = + UnquantizeTexelWeight(GetEncodingFromVector(itr)); } +} + +uint GetUnquantizedTexelWieght(uint offset_base, uint plane, bool is_dual_plane) { + const uint offset = is_dual_plane ? 
2 * offset_base + plane : offset_base; + return result_vector[offset]; +} +uvec4 GetUnquantizedWeightVector(uint t, uint s, uvec2 size, uint plane_index, bool is_dual_plane) { const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); - const uint k_plane_scale = dual_plane ? 2 : 1; - for (uint plane = 0; plane < k_plane_scale; plane++) { - for (uint t = 0; t < block_dims.y; t++) { - for (uint s = 0; s < block_dims.x; s++) { - uint cs = Ds * s; - uint ct = Dt * t; - uint gs = (cs * (size.x - 1) + 32) >> 6; - uint gt = (ct * (size.y - 1) + 32) >> 6; - uint js = gs >> 4; - uint fs = gs & 0xF; - uint jt = gt >> 4; - uint ft = gt & 0x0F; - uint w11 = (fs * ft + 8) >> 4; - uint w10 = ft - w11; - uint w01 = fs - w11; - uint w00 = 16 - fs - ft + w11; - uvec4 w = uvec4(w00, w01, w10, w11); - uint v0 = jt * size.x + js; - - uvec4 p = uvec4(0); - if (v0 < area) { - p.x = unquantized[plane][v0]; - } - if ((v0 + 1) < (area)) { - p.y = unquantized[plane][v0 + 1]; - } - if ((v0 + size.x) < (area)) { - p.z = unquantized[plane][(v0 + size.x)]; - } - if ((v0 + size.x + 1) < (area)) { - p.w = unquantized[plane][(v0 + size.x + 1)]; - } - unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; - } + const uint area = size.x * size.y; + + const uint cs = Ds * s; + const uint ct = Dt * t; + const uint gs = (cs * (size.x - 1) + 32) >> 6; + const uint gt = (ct * (size.y - 1) + 32) >> 6; + const uint js = gs >> 4; + const uint fs = gs & 0xF; + const uint jt = gt >> 4; + const uint ft = gt & 0x0F; + const uint w11 = (fs * ft + 8) >> 4; + const uint w10 = ft - w11; + const uint w01 = fs - w11; + const uint w00 = 16 - fs - ft + w11; + const uvec4 w = uvec4(w00, w01, w10, w11); + const uint v0 = jt * size.x + js; + + uvec4 p0 = uvec4(0); + uvec4 p1 = uvec4(0); + + if (v0 < area) { + const uint offset_base = v0; + p0.x = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); + 
p1.x = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); + } + if ((v0 + 1) < (area)) { + const uint offset_base = v0 + 1; + p0.y = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); + p1.y = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); + } + if ((v0 + size.x) < (area)) { + const uint offset_base = v0 + size.x; + p0.z = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); + p1.z = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); + } + if ((v0 + size.x + 1) < (area)) { + const uint offset_base = v0 + size.x + 1; + p0.w = GetUnquantizedTexelWieght(offset_base, 0, is_dual_plane); + p1.w = GetUnquantizedTexelWieght(offset_base, 1, is_dual_plane); + } + + const uint primary_weight = (uint(dot(p0, w)) + 8) >> 4; + + uvec4 weight_vec = uvec4(primary_weight); + + if (is_dual_plane) { + const uint secondary_weight = (uint(dot(p1, w)) + 8) >> 4; + for (uint c = 0; c < 4; c++) { + const bool is_secondary = ((plane_index + 1u) & 3u) == c; + weight_vec[c] = is_secondary ? 
secondary_weight : primary_weight; } } + return weight_vec; } int FindLayout(uint mode) { @@ -971,80 +897,96 @@ int FindLayout(uint mode) { return 5; } -TexelWeightParams DecodeBlockInfo() { - TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); - uint mode = StreamBits(11); + +void FillError(ivec3 coord) { + for (uint j = 0; j < block_dims.y; j++) { + for (uint i = 0; i < block_dims.x; i++) { + imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0)); + } + } +} + +void FillVoidExtentLDR(ivec3 coord) { + SkipBits(52); + const uint r_u = StreamBits(16); + const uint g_u = StreamBits(16); + const uint b_u = StreamBits(16); + const uint a_u = StreamBits(16); + const float a = float(a_u) / 65535.0f; + const float r = float(r_u) / 65535.0f; + const float g = float(g_u) / 65535.0f; + const float b = float(b_u) / 65535.0f; + for (uint j = 0; j < block_dims.y; j++) { + for (uint i = 0; i < block_dims.x; i++) { + imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); + } + } +} + +bool IsError(uint mode) { if ((mode & 0x1ff) == 0x1fc) { if ((mode & 0x200) != 0) { - params.void_extent_hdr = true; - } else { - params.void_extent_ldr = true; + // params.void_extent_hdr = true; + return true; } if ((mode & 0x400) == 0 || StreamBits(1) == 0) { - params.error_state = true; + return true; } - return params; + return false; } if ((mode & 0xf) == 0) { - params.error_state = true; - return params; + return true; } if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { - params.error_state = true; - return params; + return true; } + return false; +} + +uvec2 DecodeBlockSize(uint mode) { uint A, B; - uint mode_layout = FindLayout(mode); - switch (mode_layout) { + switch (FindLayout(mode)) { case 0: A = (mode >> 5) & 0x3; B = (mode >> 7) & 0x3; - params.size = uvec2(B + 4, A + 2); - break; + return uvec2(B + 4, A + 2); case 1: A = (mode >> 5) & 0x3; B = (mode >> 7) & 0x3; - params.size = uvec2(B + 8, A + 2); - break; + return 
uvec2(B + 8, A + 2); case 2: A = (mode >> 5) & 0x3; B = (mode >> 7) & 0x3; - params.size = uvec2(A + 2, B + 8); - break; + return uvec2(A + 2, B + 8); case 3: A = (mode >> 5) & 0x3; B = (mode >> 7) & 0x1; - params.size = uvec2(A + 2, B + 6); - break; + return uvec2(A + 2, B + 6); case 4: A = (mode >> 5) & 0x3; B = (mode >> 7) & 0x1; - params.size = uvec2(B + 2, A + 2); - break; + return uvec2(B + 2, A + 2); case 5: A = (mode >> 5) & 0x3; - params.size = uvec2(12, A + 2); - break; + return uvec2(12, A + 2); case 6: A = (mode >> 5) & 0x3; - params.size = uvec2(A + 2, 12); - break; + return uvec2(A + 2, 12); case 7: - params.size = uvec2(6, 10); - break; + return uvec2(6, 10); case 8: - params.size = uvec2(10, 6); - break; + return uvec2(10, 6); case 9: A = (mode >> 5) & 0x3; B = (mode >> 9) & 0x3; - params.size = uvec2(A + 6, B + 6); - break; + return uvec2(A + 6, B + 6); default: - params.error_state = true; - break; + return uvec2(0); } - params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); +} + +uint DecodeMaxWeight(uint mode) { + const uint mode_layout = FindLayout(mode); uint weight_index = (mode & 0x10) != 0 ? 
1 : 0; if (mode_layout < 5) { weight_index |= (mode & 0x3) << 1; @@ -1053,64 +995,34 @@ TexelWeightParams DecodeBlockInfo() { } weight_index -= 2; if ((mode_layout != 9) && ((mode & 0x200) != 0)) { - const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); - params.max_weight = max_weights[weight_index]; - } else { - const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6); - params.max_weight = max_weights[weight_index]; - } - return params; -} - -void FillError(ivec3 coord) { - for (uint j = 0; j < block_dims.y; j++) { - for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0)); - } - } -} - -void FillVoidExtentLDR(ivec3 coord) { - StreamBits(52); - uint r_u = StreamBits(16); - uint g_u = StreamBits(16); - uint b_u = StreamBits(16); - uint a_u = StreamBits(16); - float a = float(a_u) / 65535.0f; - float r = float(r_u) / 65535.0f; - float g = float(g_u) / 65535.0f; - float b = float(b_u) / 65535.0f; - for (uint j = 0; j < block_dims.y; j++) { - for (uint i = 0; i < block_dims.x; i++) { - imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a)); - } + weight_index += 6; } + return weight_index + 1; } void DecompressBlock(ivec3 coord) { - TexelWeightParams params = DecodeBlockInfo(); - if (params.error_state) { - FillError(coord); - return; - } - if (params.void_extent_hdr) { + uint mode = StreamBits(11); + if (IsError(mode)) { FillError(coord); return; } - if (params.void_extent_ldr) { + if ((mode & 0x1ff) == 0x1fc) { + // params.void_extent_ldr = true; FillVoidExtentLDR(coord); return; } - if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) { + const uvec2 size_params = DecodeBlockSize(mode); + if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) { FillError(coord); return; } - uint num_partitions = StreamBits(2) + 1; - if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) { + const uint num_partitions = StreamBits(2) + 1; + const uint mode_layout = 
FindLayout(mode); + const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); + if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) { FillError(coord); return; } - int plane_index = -1; uint partition_index = 1; uvec4 color_endpoint_mode = uvec4(0); uint ced_pointer = 0; @@ -1122,8 +1034,9 @@ void DecompressBlock(ivec3 coord) { partition_index = StreamBits(10); base_cem = StreamBits(6); } - uint base_mode = base_cem & 3; - uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight); + const uint base_mode = base_cem & 3; + const uint max_weight = DecodeMaxWeight(mode); + const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight); uint remaining_bits = 128 - weight_bits - total_bitsread; uint extra_cem_bits = 0; if (base_mode > 0) { @@ -1142,10 +1055,7 @@ void DecompressBlock(ivec3 coord) { } } remaining_bits -= extra_cem_bits; - uint plane_selector_bits = 0; - if (params.dual_plane) { - plane_selector_bits = 2; - } + const uint plane_selector_bits = dual_plane ? 2 : 0; remaining_bits -= plane_selector_bits; if (remaining_bits > 128) { // Bad data, more remaining bits than 4 bytes @@ -1153,17 +1063,17 @@ void DecompressBlock(ivec3 coord) { return; } // Read color data... 
- uint color_data_bits = remaining_bits; + const uint color_data_bits = remaining_bits; while (remaining_bits > 0) { - int nb = int(min(remaining_bits, 32U)); - uint b = StreamBits(nb); + const int nb = int(min(remaining_bits, 32U)); + const uint b = StreamBits(nb); color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); ++ced_pointer; remaining_bits -= nb; } - plane_index = int(StreamBits(plane_selector_bits)); + const uint plane_index = uint(StreamBits(plane_selector_bits)); if (base_mode > 0) { - uint extra_cem = StreamBits(extra_cem_bits); + const uint extra_cem = StreamBits(extra_cem_bits); uint cem = (extra_cem << 6) | base_cem; cem >>= 2; uvec4 C = uvec4(0); @@ -1185,70 +1095,80 @@ void DecompressBlock(ivec3 coord) { color_endpoint_mode[i] |= M[i]; } } else if (num_partitions > 1) { - uint cem = base_cem >> 2; + const uint cem = base_cem >> 2; for (uint i = 0; i < num_partitions; i++) { color_endpoint_mode[i] = cem; } } - DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits); - uvec4 endpoints[4][2]; - for (uint i = 0; i < num_partitions; i++) { - ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); + uvec4 endpoints0[4]; + uvec4 endpoints1[4]; + { + // This decode phase should at most push 32 elements into the vector + result_vector_max_index = 32; + uint color_values[32]; + uint colvals_index = 0; + DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values); + for (uint i = 0; i < num_partitions; i++) { + ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values, + colvals_index); + } } + color_endpoint_data = local_buff; + color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx; + const uint clear_byte_start = (weight_bits >> 3) + 1; - texel_weight_data = local_buff; - texel_weight_data = bitfieldReverse(texel_weight_data).wzyx; - uint clear_byte_start = - (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; - - uint 
byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) & - uint( - ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); - uint vec_index = (clear_byte_start - 1) >> 2; - texel_weight_data[vec_index] = - bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); + const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & + uint(((1 << (weight_bits % 8)) - 1)); + const uint vec_index = (clear_byte_start - 1) >> 2; + color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, + int((clear_byte_start - 1) % 4) * 8, 8); for (uint i = clear_byte_start; i < 16; ++i) { - uint idx = i >> 2; - texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8); + const uint idx = i >> 2; + color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8); } - texel_flag = true; // use texel "vector" and bit stream in integer decoding - DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); - UnquantizeTexelWeights(params.dual_plane, params.size); + // Re-init vector variables for next decode phase + result_index = 0; + color_bitsread = 0; + result_limit_reached = false; + // The limit for the Unquantize phase, avoids decoding more data than needed. 
+ result_vector_max_index = size_params.x * size_params.y; + if (dual_plane) { + result_vector_max_index *= 2; + } + DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane)); + + UnquantizeTexelWeights(size_params, dual_plane); for (uint j = 0; j < block_dims.y; j++) { for (uint i = 0; i < block_dims.x; i++) { uint local_partition = 0; if (num_partitions > 1) { - local_partition = Select2DPartition(partition_index, i, j, num_partitions, - (block_dims.y * block_dims.x) < 32); - } - vec4 p; - uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); - uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); - uvec4 plane_vec = uvec4(0); - uvec4 weight_vec = uvec4(0); - for (uint c = 0; c < 4; c++) { - if (params.dual_plane && (((plane_index + 1) & 3) == c)) { - plane_vec[c] = 1; - } - weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i]; + local_partition = Select2DPartition(partition_index, i, j, num_partitions); } - vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); - p = (Cf / 65535.0); + const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]); + const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]); + const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane); + const vec4 Cf = + vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64); + const vec4 p = (Cf / 65535.0f); imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar); } } } +uint SwizzleOffset(uvec2 pos) { + const uint x = pos.x; + const uint y = pos.y; + return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + + ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16); +} + void main() { uvec3 pos = gl_GlobalInvocationID; pos.x <<= BYTES_PER_BLOCK_LOG2; - - // Read as soon as possible due to its latency const uint swizzle = SwizzleOffset(pos.xy); - const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; uint offset = 0; @@ -1262,8 +1182,6 @@ void main() { if 
(any(greaterThanEqual(coord, imageSize(dest_image)))) { return; } - current_index = 0; - bitsread = 0; local_buff = astc_data[offset / 16]; DecompressBlock(coord); } diff --git a/src/video_core/host_shaders/convert_d32f_to_abgr8.frag b/src/video_core/host_shaders/convert_d32f_to_abgr8.frag new file mode 100644 index 000000000..04cfef8b5 --- /dev/null +++ b/src/video_core/host_shaders/convert_d32f_to_abgr8.frag @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 + +layout(binding = 0) uniform sampler2D depth_tex; + +layout(location = 0) out vec4 color; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + float depth = textureLod(depth_tex, coord, 0).r; + color = vec4(depth, depth, depth, 1.0); +} diff --git a/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp index fc3854d18..66f2ad483 100644 --- a/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp +++ b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp @@ -15,11 +15,14 @@ void main() { // TODO: Specialization constants for num_samples? 
const int num_samples = imageSamples(msaa_in); + const ivec3 msaa_size = imageSize(msaa_in); + const ivec3 out_size = imageSize(output_img); + const ivec3 scale = out_size / msaa_size; for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) { const vec4 pixel = imageLoad(msaa_in, coords, curr_sample); - const int single_sample_x = 2 * coords.x + (curr_sample & 1); - const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1); + const int single_sample_x = scale.x * coords.x + (curr_sample & 1); + const int single_sample_y = scale.y * coords.y + ((curr_sample / 2) & 1); const ivec3 dest_coords = ivec3(single_sample_x, single_sample_y, coords.z); if (any(greaterThanEqual(dest_coords, imageSize(output_img)))) { diff --git a/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp index dedd962f1..c7ce38efa 100644 --- a/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp +++ b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp @@ -15,9 +15,12 @@ void main() { // TODO: Specialization constants for num_samples? 
const int num_samples = imageSamples(output_msaa); + const ivec3 msaa_size = imageSize(output_msaa); + const ivec3 out_size = imageSize(img_in); + const ivec3 scale = out_size / msaa_size; for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) { - const int single_sample_x = 2 * coords.x + (curr_sample & 1); - const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1); + const int single_sample_x = scale.x * coords.x + (curr_sample & 1); + const int single_sample_y = scale.y * coords.y + ((curr_sample / 2) & 1); const ivec3 single_coords = ivec3(single_sample_x, single_sample_y, coords.z); if (any(greaterThanEqual(single_coords, imageSize(img_in)))) { diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..6faa8981f --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#version 460 core + +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_arithmetic : require + +#ifdef VULKAN + +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint 
max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; +END_PUSH_CONSTANTS + +#define LOCAL_RESULTS 8 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[]; +}; + +layout(std430, binding = 1) coherent buffer block2 { + uvec2 output_data[]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[128]; + +// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +// do subgroup Prefix Sum using Hillis and Steele's algorithm +uvec2 subgroupInclusiveAddUint64(uvec2 value) { + uvec2 result = value; + for (uint i = 1; i < gl_SubgroupSize; i *= 2) { + uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; + if (i <= gl_SubgroupInvocationID) { + result = AddUint64(result, other); + } + } + return result; +} + +// Writes down the results to the output buffer and to the accumulation buffer +void WriteResults(uvec2 results[LOCAL_RESULTS]) { + const uint current_id = gl_LocalInvocationID.x; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? 
accum : uvec2(0, 0); + AddUint64(results[i], base_data); + } + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i]; + } + uint index = accumulation_limit % LOCAL_RESULTS; + uint base_id = accumulation_limit / LOCAL_RESULTS; + if (min_accumulation_base >= accumulation_limit + 1) { + if (current_id == base_id) { + accumulated_data = results[index]; + } + return; + } + // We have that ugly case in which the accumulation data is reset in the middle somewhere. + barrier(); + groupMemoryBarrier(); + + if (current_id == base_id) { + uvec2 reset_value = output_data[max_accumulation_base - 1]; + // Calculate two complement / negate manually + reset_value = AddUint64(uvec2(1,0), ~reset_value); + accumulated_data = AddUint64(results[index], reset_value); + } +} + +void main() { + const uint subgroup_inv_id = gl_SubgroupInvocationID; + const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups; + const uint last_subgroup_id = subgroupMax(subgroup_inv_id); + const uint current_id = gl_LocalInvocationID.x; + const uint total_work = accumulation_limit; + const uint last_result_id = LOCAL_RESULTS - 1; + uvec2 data[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i]; + } + uvec2 results[LOCAL_RESULTS]; + results[0] = data[0]; + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(data[i], results[i - 1]); + } + // make sure all input data has been loaded + subgroupBarrier(); + subgroupMemoryBarrier(); + + // on the last local result, do a subgroup inclusive scan sum + results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]); + // get the last local result from the subgroup behind the current + uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1); + if (subgroup_inv_id != 0) { + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i - 1] = AddUint64(results[i - 1], 
result_behind); + } + } + + // if we had less queries than our subgroup, just write down the results. + if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch. + WriteResults(results); + return; + } + + // We now have more, so lets write the last result into shared memory. + // Only pick the last subgroup. + if (subgroup_inv_id == last_subgroup_id) { + shared_data[subgroup_id] = results[last_result_id]; + } + // wait until everyone loaded their stuffs + barrier(); + memoryBarrierShared(); + + // only if it's not the first subgroup + if (subgroup_id != 0) { + // get the results from some previous invocation + uvec2 tmp = shared_data[subgroup_inv_id]; + subgroupBarrier(); + subgroupMemoryBarrierShared(); + tmp = subgroupInclusiveAddUint64(tmp); + // obtain the result that would be equivalent to the previous result + uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1); + for (uint i = 0; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(results[i], shuffled_result); + } + } + WriteResults(results); +}
\ No newline at end of file diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp new file mode 100644 index 000000000..559a213b9 --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp @@ -0,0 +1,138 @@ +// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel +// SPDX-License-Identifier: MIT + +// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and +// Nicholas Haemel. Modified to suit needs. + +#version 460 core + +#ifdef VULKAN + +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; +END_PUSH_CONSTANTS + +#define LOCAL_RESULTS 4 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; +}; + +layout(std430, binding = 1) writeonly coherent buffer block2 { + uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; + +uvec2 AddUint64(uvec2 value_1, uvec2 
value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +void main(void) { + uint id = gl_LocalInvocationID.x; + uvec2 base_value[LOCAL_RESULTS]; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base + ? accumulated_data + : uvec2(0); + } + uint work_size = gl_WorkGroupSize.x; + uint rd_id; + uint wr_id; + uint mask; + uvec2 inputs[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i]; + } + // The number of steps is the log base 2 of the + // work group size, which should be a power of 2 + const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS)); + uint step = 0; + + // Each invocation is responsible for the content of + // two elements of the output array + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = inputs[i]; + } + // Synchronize to make sure that everyone has initialized + // their elements of shared_data[] with data loaded from + // the input arrays + barrier(); + memoryBarrierShared(); + // For each step... 
+ for (step = 0; step < steps; step++) { + // Calculate the read and write index in the + // shared array + mask = (1 << step) - 1; + rd_id = ((id >> step) << (step + 1)) + mask; + wr_id = rd_id + 1 + (id & mask); + // Accumulate the read data into our element + + shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); + // Synchronize again to make sure that everyone + // has caught up with us + barrier(); + memoryBarrierShared(); + } + // Add the accumulation + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = + AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]); + } + barrier(); + memoryBarrierShared(); + + // Finally write our data back to the output buffer + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i]; + } + if (id == 0) { + if (min_accumulation_base >= accumulation_limit + 1) { + accumulated_data = shared_data[accumulation_limit]; + return; + } + uvec2 reset_value = shared_data[max_accumulation_base - 1]; + uvec2 final_value = shared_data[accumulation_limit]; + // Two complements + reset_value = AddUint64(uvec2(1, 0), ~reset_value); + accumulated_data = AddUint64(final_value, reset_value); + } +}
\ No newline at end of file diff --git a/src/video_core/host_shaders/resolve_conditional_render.comp b/src/video_core/host_shaders/resolve_conditional_render.comp new file mode 100644 index 000000000..307e77d1a --- /dev/null +++ b/src/video_core/host_shaders/resolve_conditional_render.comp @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#version 450 + +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer Query { + uvec2 initial; + uvec2 unknown; + uvec2 current; +}; + +layout(std430, binding = 1) buffer Result { + uint result; +}; + +void main() { + result = all(equal(initial, current)) ? 1 : 0; +} diff --git a/src/video_core/host_shaders/vulkan_depthstencil_clear.frag b/src/video_core/host_shaders/vulkan_depthstencil_clear.frag new file mode 100644 index 000000000..1ac177c7e --- /dev/null +++ b/src/video_core/host_shaders/vulkan_depthstencil_clear.frag @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 460 core + +layout (push_constant) uniform PushConstants { + vec4 clear_depth; +}; + +void main() { + gl_FragDepth = clear_depth.x; +} diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 905505ca1..5d0bb9cc4 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -27,14 +27,24 @@ MICROPROFILE_DEFINE(MacroHLE, "GPU", "Execute macro HLE", MP_RGB(128, 192, 192)) namespace Tegra { -static void Dump(u64 hash, std::span<const u32> code) { +static void Dump(u64 hash, std::span<const u32> code, bool decompiled = false) { const auto base_dir{Common::FS::GetYuzuPath(Common::FS::YuzuPath::DumpDir)}; const auto macro_dir{base_dir / "macros"}; if (!Common::FS::CreateDir(base_dir) || !Common::FS::CreateDir(macro_dir)) { LOG_ERROR(Common_Filesystem, "Failed to create macro dump directories"); return; } - const auto name{macro_dir / 
fmt::format("{:016x}.macro", hash)}; + auto name{macro_dir / fmt::format("{:016x}.macro", hash)}; + + if (decompiled) { + auto new_name{macro_dir / fmt::format("decompiled_{:016x}.macro", hash)}; + if (Common::FS::Exists(name)) { + (void)Common::FS::RenameFile(name, new_name); + return; + } + name = new_name; + } + std::fstream macro_file(name, std::ios::out | std::ios::binary); if (!macro_file) { LOG_ERROR(Common_Filesystem, "Unable to open or create file at {}", @@ -90,9 +100,6 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { if (!mid_method.has_value()) { cache_info.lle_program = Compile(macro_code->second); cache_info.hash = Common::HashValue(macro_code->second); - if (Settings::values.dump_macros) { - Dump(cache_info.hash, macro_code->second); - } } else { const auto& macro_cached = uploaded_macro_code[mid_method.value()]; const auto rebased_method = method - mid_method.value(); @@ -102,9 +109,6 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { code.size() * sizeof(u32)); cache_info.hash = Common::HashValue(code); cache_info.lle_program = Compile(code); - if (Settings::values.dump_macros) { - Dump(cache_info.hash, code); - } } auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); @@ -117,6 +121,10 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { MICROPROFILE_SCOPE(MacroHLE); cache_info.hle_program->Execute(parameters, method); } + + if (Settings::values.dump_macros) { + Dump(cache_info.hash, macro_code->second, cache_info.has_hle_program); + } } } diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 6272a4652..046c8085e 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -67,6 +67,7 @@ public: } auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = false; params.include_count = false; params.count_start_address = 0; @@ -161,6 
+162,7 @@ public: 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); } auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = true; params.include_count = false; params.count_start_address = 0; @@ -256,6 +258,7 @@ public: const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize()); maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; params.is_indexed = true; params.include_count = true; params.count_start_address = maxwell3d.GetMacroAddress(4); @@ -319,6 +322,47 @@ private: } }; +class HLE_DrawIndirectByteCount final : public HLEMacroImpl { +public: + explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override { + auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU); + if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { + Fallback(parameters); + return; + } + + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = true; + params.is_indexed = false; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.GetMacroAddress(2); + params.buffer_size = 4; + params.max_draw_counts = 1; + params.stride = parameters[1]; + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + maxwell3d.draw_manager->DrawArrayIndirect(topology); + } + +private: + void Fallback(const std::vector<u32>& parameters) { + maxwell3d.RefreshParameters(); + + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + maxwell3d.draw_manager->DrawArray( + 
maxwell3d.regs.draw.topology, 0, + maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); + } +}; + class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { public: explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} @@ -536,6 +580,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} { [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__); })); + builders.emplace(0xB5F74EDB717278ECULL, + std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>( + [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { + return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__); + })); } HLEMacro::~HLEMacro() = default; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 1528cc1dd..9fcaeeac7 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -25,6 +25,13 @@ #include "video_core/rasterizer_interface.h" #include "video_core/texture_cache/slot_vector.h" +namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; +} // namespace VideoCore + namespace VideoCommon { using AsyncJobId = SlotId; @@ -98,12 +105,14 @@ private: }; template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> -class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { +class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { public: - explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_) + explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_) : rasterizer{rasterizer_}, - cpu_memory{cpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + // Use reinterpret_cast instead of static_cast as workaround for + // UBSan bug 
(https://github.com/llvm/llvm-project/issues/59060) + cpu_memory{cpu_memory_}, streams{{CounterStream{reinterpret_cast<QueryCache&>(*this), VideoCore::QueryType::SamplesPassed}}} { (void)slot_async_jobs.insert(); // Null value } diff --git a/src/video_core/query_cache/bank_base.h b/src/video_core/query_cache/bank_base.h new file mode 100644 index 000000000..44769ea97 --- /dev/null +++ b/src/video_core/query_cache/bank_base.h @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <atomic> +#include <deque> +#include <utility> + +#include "common/common_types.h" + +namespace VideoCommon { + +class BankBase { +protected: + const size_t base_bank_size{}; + size_t bank_size{}; + std::atomic<size_t> references{}; + size_t current_slot{}; + +public: + explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {} + + virtual ~BankBase() = default; + + virtual std::pair<bool, size_t> Reserve() { + if (IsClosed()) { + return {false, bank_size}; + } + const size_t result = current_slot++; + return {true, result}; + } + + virtual void Reset() { + current_slot = 0; + references = 0; + bank_size = base_bank_size; + } + + size_t Size() const { + return bank_size; + } + + void AddReference(size_t how_many = 1) { + references.fetch_add(how_many, std::memory_order_relaxed); + } + + void CloseReference(size_t how_many = 1) { + if (how_many > references.load(std::memory_order_relaxed)) { + UNREACHABLE(); + } + references.fetch_sub(how_many, std::memory_order_relaxed); + } + + void Close() { + bank_size = current_slot; + } + + bool IsClosed() const { + return current_slot >= bank_size; + } + + bool IsDead() const { + return IsClosed() && references == 0; + } +}; + +template <typename BankType> +class BankPool { +private: + std::deque<BankType> bank_pool; + std::deque<size_t> bank_indices; + +public: + BankPool() = default; + ~BankPool() = default; + + // 
Reserve a bank from the pool and return its index + template <typename Func> + size_t ReserveBank(Func&& builder) { + if (!bank_indices.empty() && bank_pool[bank_indices.front()].IsDead()) { + size_t new_index = bank_indices.front(); + bank_indices.pop_front(); + bank_pool[new_index].Reset(); + bank_indices.push_back(new_index); + return new_index; + } + size_t new_index = bank_pool.size(); + builder(bank_pool, new_index); + bank_indices.push_back(new_index); + return new_index; + } + + // Get a reference to a bank using its index + BankType& GetBank(size_t index) { + return bank_pool[index]; + } + + // Get the total number of banks in the pool + size_t BankCount() const { + return bank_pool.size(); + } +}; + +} // namespace VideoCommon diff --git a/src/video_core/query_cache/query_base.h b/src/video_core/query_cache/query_base.h new file mode 100644 index 000000000..1d786b3a7 --- /dev/null +++ b/src/video_core/query_cache/query_base.h @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace VideoCommon { + +enum class QueryFlagBits : u32 { + HasTimestamp = 1 << 0, ///< Indicates if this query has a timestamp. + IsFinalValueSynced = 1 << 1, ///< Indicates if the query has been synced in the host + IsHostSynced = 1 << 2, ///< Indicates if the query has been synced in the host + IsGuestSynced = 1 << 3, ///< Indicates if the query has been synced with the guest. + IsHostManaged = 1 << 4, ///< Indicates if this query points to a host query + IsRewritten = 1 << 5, ///< Indicates if this query was rewritten by another query + IsInvalidated = 1 << 6, ///< Indicates the value of the query has been nullified. + IsOrphan = 1 << 7, ///< Indicates the query has not been set by a guest query. + IsFence = 1 << 8, ///< Indicates the query is a fence.
+}; +DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits) + +class QueryBase { +public: + VAddr guest_address{}; + QueryFlagBits flags{}; + u64 value{}; + +protected: + // Default constructor + QueryBase() = default; + + // Parameterized constructor + QueryBase(VAddr address, QueryFlagBits flags_, u64 value_) + : guest_address(address), flags(flags_), value{value_} {} +}; + +class GuestQuery : public QueryBase { +public: + // Parameterized constructor + GuestQuery(bool isLong, VAddr address, u64 queryValue) + : QueryBase(address, QueryFlagBits::IsFinalValueSynced, queryValue) { + if (isLong) { + flags |= QueryFlagBits::HasTimestamp; + } + } +}; + +class HostQueryBase : public QueryBase { +public: + // Default constructor + HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {} + + // Parameterized constructor + HostQueryBase(bool has_timestamp, VAddr address) + : QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{}, + start_slot{}, size_slots{} { + if (has_timestamp) { + flags |= QueryFlagBits::HasTimestamp; + } + } + + u32 start_bank_id{}; + u32 size_banks{}; + size_t start_slot{}; + size_t size_slots{}; +}; + +} // namespace VideoCommon
\ No newline at end of file diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h new file mode 100644 index 000000000..78b42b518 --- /dev/null +++ b/src/video_core/query_cache/query_cache.h @@ -0,0 +1,580 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <array> +#include <deque> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/query_cache/bank_base.h" +#include "video_core/query_cache/query_base.h" +#include "video_core/query_cache/query_cache_base.h" +#include "video_core/query_cache/query_stream.h" +#include "video_core/query_cache/types.h" + +namespace VideoCommon { + +using Maxwell = Tegra::Engines::Maxwell3D; + +struct SyncValuesStruct { + VAddr address; + u64 value; + u64 size; + + static constexpr bool GeneratesBaseBuffer = true; +}; + +template <typename Traits> +class GuestStreamer : public SimpleStreamer<GuestQuery> { +public: + using RuntimeType = typename Traits::RuntimeType; + + GuestStreamer(size_t id_, RuntimeType& runtime_) + : SimpleStreamer<GuestQuery>(id_), runtime{runtime_} {} + + virtual ~GuestStreamer() = default; + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional<u32> subreport = std::nullopt) override { + auto new_id = BuildQuery(has_timestamp, address, static_cast<u64>(value)); + pending_sync.push_back(new_id); + return new_id; + } + + bool HasPendingSync() const override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + if (pending_sync.empty()) { + return; + } + std::vector<SyncValuesStruct> 
sync_values; + sync_values.reserve(pending_sync.size()); + for (size_t pending_id : pending_sync) { + auto& query = slot_queries[pending_id]; + if (True(query.flags & QueryFlagBits::IsRewritten) || + True(query.flags & QueryFlagBits::IsInvalidated)) { + continue; + } + query.flags |= QueryFlagBits::IsHostSynced; + sync_values.emplace_back(SyncValuesStruct{ + .address = query.guest_address, + .value = query.value, + .size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)}); + } + pending_sync.clear(); + if (sync_values.size() > 0) { + runtime.template SyncValues<SyncValuesStruct>(sync_values); + } + } + +private: + RuntimeType& runtime; + std::deque<size_t> pending_sync; +}; + +template <typename Traits> +class StubStreamer : public GuestStreamer<Traits> { +public: + using RuntimeType = typename Traits::RuntimeType; + + StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_) + : GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {} + + ~StubStreamer() override = default; + + size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value, + std::optional<u32> subreport = std::nullopt) override { + size_t new_id = + GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport); + return new_id; + } + +private: + u32 stub_value; +}; + +template <typename Traits> +struct QueryCacheBase<Traits>::QueryCacheBaseImpl { + using RuntimeType = typename Traits::RuntimeType; + + QueryCacheBaseImpl(QueryCacheBase<Traits>* owner_, VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_) + : owner{owner_}, rasterizer{rasterizer_}, + cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} { + streamer_mask = 0; + for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) { + streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i)); + if (streamers[i]) { + streamer_mask |= 1ULL << streamers[i]->GetId(); + } 
+ } + } + + template <typename Func> + void ForEachStreamerIn(u64 mask, Func&& func) { + static constexpr bool RETURNS_BOOL = + std::is_same_v<std::invoke_result_t<Func, StreamerInterface*>, bool>; + while (mask != 0) { + size_t position = std::countr_zero(mask); + mask &= ~(1ULL << position); + if constexpr (RETURNS_BOOL) { + if (func(streamers[position])) { + return; + } + } else { + func(streamers[position]); + } + } + } + + template <typename Func> + void ForEachStreamer(Func&& func) { + ForEachStreamerIn(streamer_mask, func); + } + + QueryBase* ObtainQuery(QueryCacheBase<Traits>::QueryLocation location) { + size_t which_stream = location.stream_id.Value(); + auto* streamer = streamers[which_stream]; + if (!streamer) { + return nullptr; + } + return streamer->GetQuery(location.query_id.Value()); + } + + QueryCacheBase<Traits>* owner; + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + RuntimeType& runtime; + Tegra::GPU& gpu; + std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers; + u64 streamer_mask; + std::mutex flush_guard; + std::deque<u64> flushes_pending; + std::vector<QueryCacheBase<Traits>::QueryLocation> pending_unregister; +}; + +template <typename Traits> +QueryCacheBase<Traits>::QueryCacheBase(Tegra::GPU& gpu_, + VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_) + : cached_queries{} { + impl = std::make_unique<QueryCacheBase<Traits>::QueryCacheBaseImpl>( + this, rasterizer_, cpu_memory_, runtime_, gpu_); +} + +template <typename Traits> +QueryCacheBase<Traits>::~QueryCacheBase() = default; + +template <typename Traits> +void QueryCacheBase<Traits>::CounterEnable(QueryType counter_type, bool is_enabled) { + size_t index = static_cast<size_t>(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNREACHABLE(); + return; + } + if (is_enabled) { + streamer->StartCounter(); + } else { +
streamer->PauseCounter(); + } +} + +template <typename Traits> +void QueryCacheBase<Traits>::CounterClose(QueryType counter_type) { + size_t index = static_cast<size_t>(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNREACHABLE(); + return; + } + streamer->CloseCounter(); +} + +template <typename Traits> +void QueryCacheBase<Traits>::CounterReset(QueryType counter_type) { + size_t index = static_cast<size_t>(counter_type); + StreamerInterface* streamer = impl->streamers[index]; + if (!streamer) [[unlikely]] { + UNIMPLEMENTED(); + return; + } + streamer->ResetCounter(); +} + +template <typename Traits> +void QueryCacheBase<Traits>::BindToChannel(s32 id) { + VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo>::BindToChannel(id); + impl->runtime.Bind3DEngine(maxwell3d); +} + +template <typename Traits> +void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type, + QueryPropertiesFlags flags, u32 payload, u32 subreport) { + const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout); + const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence); + size_t streamer_id = static_cast<size_t>(counter_type); + auto* streamer = impl->streamers[streamer_id]; + if (streamer == nullptr) [[unlikely]] { + counter_type = QueryType::Payload; + payload = 1U; + streamer_id = static_cast<size_t>(counter_type); + streamer = impl->streamers[streamer_id]; + } + auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr); + if (!cpu_addr_opt) [[unlikely]] { + return; + } + VAddr cpu_addr = *cpu_addr_opt; + const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport); + auto* query = streamer->GetQuery(new_query_id); + if (is_fence) { + query->flags |= QueryFlagBits::IsFence; + } + QueryLocation query_location{}; + query_location.stream_id.Assign(static_cast<u32>(streamer_id)); + query_location.query_id.Assign(static_cast<u32>(new_query_id)); + const auto 
gen_caching_indexing = [](VAddr cur_addr) { + return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS, + static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK)); + }; + u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); + u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); + bool is_synced = !Settings::IsGPULevelHigh() && is_fence; + + std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location, + pointer, pointer_timestamp] { + if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { + if (!is_synced) [[likely]] { + impl->pending_unregister.push_back(query_location); + } + return; + } + if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { + UNREACHABLE(); + return; + } + query_base->value += streamer->GetAmmendValue(); + streamer->SetAccumulationValue(query_base->value); + if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { + u64 timestamp = impl->gpu.GetTicks(); + std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); + std::memcpy(pointer, &query_base->value, sizeof(query_base->value)); + } else { + u32 value = static_cast<u32>(query_base->value); + std::memcpy(pointer, &value, sizeof(value)); + } + if (!is_synced) [[likely]] { + impl->pending_unregister.push_back(query_location); + } + }); + if (is_fence) { + impl->rasterizer.SignalFence(std::move(operation)); + } else { + if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) { + if (has_timestamp) { + u64 timestamp = impl->gpu.GetTicks(); + u64 value = static_cast<u64>(payload); + std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp)); + std::memcpy(pointer, &value, sizeof(value)); + } else { + std::memcpy(pointer, &payload, sizeof(payload)); + } + streamer->Free(new_query_id); + return; + } + impl->rasterizer.SyncOperation(std::move(operation)); + } + if (is_synced) { + streamer->Free(new_query_id); + return; + } + auto [cont_addr, base] = gen_caching_indexing(cpu_addr); + { +
std::scoped_lock lock(cache_mutex); + auto it1 = cached_queries.try_emplace(cont_addr); + auto& sub_container = it1.first->second; + auto it_current = sub_container.find(base); + if (it_current == sub_container.end()) { + sub_container.insert_or_assign(base, query_location); + return; + } + auto* old_query = impl->ObtainQuery(it_current->second); + old_query->flags |= QueryFlagBits::IsRewritten; + sub_container.insert_or_assign(base, query_location); + } +} + +template <typename Traits> +void QueryCacheBase<Traits>::UnregisterPending() { + const auto gen_caching_indexing = [](VAddr cur_addr) { + return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS, + static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK)); + }; + std::scoped_lock lock(cache_mutex); + for (QueryLocation loc : impl->pending_unregister) { + const auto [streamer_id, query_id] = loc.unpack(); + auto* streamer = impl->streamers[streamer_id]; + if (!streamer) [[unlikely]] { + continue; + } + auto* query = streamer->GetQuery(query_id); + auto [cont_addr, base] = gen_caching_indexing(query->guest_address); + auto it1 = cached_queries.find(cont_addr); + if (it1 != cached_queries.end()) { + auto it2 = it1->second.find(base); + if (it2 != it1->second.end()) { + if (it2->second.raw == loc.raw) { + it1->second.erase(it2); + } + } + } + streamer->Free(query_id); + } + impl->pending_unregister.clear(); +} + +template <typename Traits> +void QueryCacheBase<Traits>::NotifyWFI() { + bool should_sync = false; + impl->ForEachStreamer( + [&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); }); + if (!should_sync) { + return; + } + + impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->PresyncWrites(); }); + impl->runtime.Barriers(true); + impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); }); + impl->runtime.Barriers(false); +} + +template <typename Traits> +void QueryCacheBase<Traits>::NotifySegment(bool resume) { + if 
(resume) { + impl->runtime.ResumeHostConditionalRendering(); + } else { + CounterClose(VideoCommon::QueryType::ZPassPixelCount64); + CounterClose(VideoCommon::QueryType::StreamingByteCount); + impl->runtime.PauseHostConditionalRendering(); + } +} + +template <typename Traits> +bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() { + bool qc_dirty = false; + const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData { + auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address); + if (!cpu_addr_opt) [[unlikely]] { + return VideoCommon::LookupData{ + .address = 0, + .found_query = nullptr, + }; + } + VAddr cpu_addr = *cpu_addr_opt; + std::scoped_lock lock(cache_mutex); + auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS); + if (it1 == cached_queries.end()) { + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = nullptr, + }; + } + auto& sub_container = it1->second; + auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK); + + if (it_current == sub_container.end()) { + it_current = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4); + if (it_current == sub_container.end()) { + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = nullptr, + }; + } + } + auto* query = impl->ObtainQuery(it_current->second); + qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) && + False(query->flags & QueryFlagBits::IsGuestSynced); + return VideoCommon::LookupData{ + .address = cpu_addr, + .found_query = query, + }; + }; + + auto& regs = maxwell3d->regs; + if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) { + impl->runtime.EndHostConditionalRendering(); + return false; + } + const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode); + const GPUVAddr address = regs.render_enable.Address(); + switch (mode) { + case ComparisonMode::True: +
impl->runtime.EndHostConditionalRendering(); + return false; + case ComparisonMode::False: + impl->runtime.EndHostConditionalRendering(); + return false; + case ComparisonMode::Conditional: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty); + } + case ComparisonMode::IfEqual: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + VideoCommon::LookupData object_2{gen_lookup(address + 16)}; + return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, + true); + } + case ComparisonMode::IfNotEqual: { + VideoCommon::LookupData object_1{gen_lookup(address)}; + VideoCommon::LookupData object_2{gen_lookup(address + 16)}; + return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty, + false); + } + default: + return false; + } +} + +// Async downloads +template <typename Traits> +void QueryCacheBase<Traits>::CommitAsyncFlushes() { + // Make sure to have the results synced in Host. 
+ NotifyWFI(); + + u64 mask{}; + { + std::scoped_lock lk(impl->flush_guard); + impl->ForEachStreamer([&mask](StreamerInterface* streamer) { + bool local_result = streamer->HasUnsyncedQueries(); + if (local_result) { + mask |= 1ULL << streamer->GetId(); + } + }); + impl->flushes_pending.push_back(mask); + } + std::function<void()> func([this] { UnregisterPending(); }); + impl->rasterizer.SyncOperation(std::move(func)); + if (mask == 0) { + return; + } + u64 ran_mask = ~mask; + while (mask) { + impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { + u64 dep_mask = streamer->GetDependentMask(); + if ((dep_mask & ~ran_mask) != 0) { + return; + } + u64 index = streamer->GetId(); + ran_mask |= (1ULL << index); + mask &= ~(1ULL << index); + streamer->PushUnsyncedQueries(); + }); + } +} + +template <typename Traits> +bool QueryCacheBase<Traits>::HasUncommittedFlushes() const { + bool result = false; + impl->ForEachStreamer([&result](StreamerInterface* streamer) { + result |= streamer->HasUnsyncedQueries(); + return result; + }); + return result; +} + +template <typename Traits> +bool QueryCacheBase<Traits>::ShouldWaitAsyncFlushes() { + std::scoped_lock lk(impl->flush_guard); + return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL; +} + +template <typename Traits> +void QueryCacheBase<Traits>::PopAsyncFlushes() { + u64 mask; + { + std::scoped_lock lk(impl->flush_guard); + mask = impl->flushes_pending.front(); + impl->flushes_pending.pop_front(); + } + if (mask == 0) { + return; + } + u64 ran_mask = ~mask; + while (mask) { + impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) { + u64 dep_mask = streamer->GetDependenceMask(); + if ((dep_mask & ~ran_mask) != 0) { + return; + } + u64 index = streamer->GetId(); + ran_mask |= (1ULL << index); + mask &= ~(1ULL << index); + streamer->PopUnsyncedQueries(); + }); + } +} + +// Invalidation + +template <typename Traits> +void 
QueryCacheBase<Traits>::InvalidateQuery(QueryCacheBase<Traits>::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return; + } + query_base->flags |= QueryFlagBits::IsInvalidated; +} + +template <typename Traits> +bool QueryCacheBase<Traits>::IsQueryDirty(QueryCacheBase<Traits>::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return false; + } + return True(query_base->flags & QueryFlagBits::IsHostManaged) && + False(query_base->flags & QueryFlagBits::IsGuestSynced); +} + +template <typename Traits> +bool QueryCacheBase<Traits>::SemiFlushQueryDirty(QueryCacheBase<Traits>::QueryLocation location) { + auto* query_base = impl->ObtainQuery(location); + if (!query_base) { + return false; + } + if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) && + False(query_base->flags & QueryFlagBits::IsGuestSynced)) { + auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address); + if (True(query_base->flags & QueryFlagBits::HasTimestamp)) { + std::memcpy(ptr, &query_base->value, sizeof(query_base->value)); + return false; + } + u32 value_l = static_cast<u32>(query_base->value); + std::memcpy(ptr, &value_l, sizeof(value_l)); + return false; + } + return True(query_base->flags & QueryFlagBits::IsHostManaged) && + False(query_base->flags & QueryFlagBits::IsGuestSynced); +} + +template <typename Traits> +void QueryCacheBase<Traits>::RequestGuestHostSync() { + impl->rasterizer.ReleaseFences(); +} + +} // namespace VideoCommon diff --git a/src/video_core/query_cache/query_cache_base.h b/src/video_core/query_cache/query_cache_base.h new file mode 100644 index 000000000..07be421c6 --- /dev/null +++ b/src/video_core/query_cache/query_cache_base.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <functional> +#include <mutex> +#include <optional> +#include <span> 
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "core/memory.h"
+#include "video_core/control/channel_state_cache.h"
+#include "video_core/query_cache/query_base.h"
+#include "video_core/query_cache/types.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Tegra {
+class GPU;
+}
+
+namespace VideoCommon {
+
+/// Pairs a CPU address with the query found for it (nullptr semantics are up to the user).
+struct LookupData {
+    VAddr address;
+    QueryBase* found_query;
+};
+
+/// Front end of the query cache. Tracks which guest addresses hold query results and forwards
+/// counter operations to the implementation hidden behind QueryCacheBaseImpl.
+template <typename Traits>
+class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
+    using RuntimeType = typename Traits::RuntimeType;
+
+public:
+    /// Packed handle of a query: a 5-bit streamer id and a 27-bit query id sharing one u32.
+    union QueryLocation {
+        BitField<27, 5, u32> stream_id;
+        BitField<0, 27, u32> query_id;
+        u32 raw;
+
+        /// Returns {stream_id, query_id} widened to size_t.
+        std::pair<size_t, size_t> unpack() const {
+            return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())};
+        }
+    };
+
+    explicit QueryCacheBase(Tegra::GPU& gpu, VideoCore::RasterizerInterface& rasterizer_,
+                            Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_);
+
+    ~QueryCacheBase();
+
+    /// Marks every cached query overlapping [addr, addr + size) as invalidated and removes it
+    /// from the address lookup.
+    void InvalidateRegion(VAddr addr, std::size_t size) {
+        IterateCache<true>(addr, size,
+                           [this](QueryLocation location) { InvalidateQuery(location); });
+    }
+
+    /// Semi-flushes cached queries overlapping [addr, addr + size). The walk stops at the first
+    /// query that still needs a host sync, and a guest/host sync is then requested.
+    void FlushRegion(VAddr addr, std::size_t size) {
+        bool result = false;
+        IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
+            result |= SemiFlushQueryDirty(location);
+            return result;
+        });
+        if (result) {
+            RequestGuestHostSync();
+        }
+    }
+
+    /// Builds a bitmask with bit N set for every query type whose numeric value is N.
+    static u64 BuildMask(std::span<const QueryType> types) {
+        u64 mask = 0;
+        for (auto query_type : types) {
+            mask |= 1ULL << (static_cast<u64>(query_type));
+        }
+        return mask;
+    }
+
+    /// Return true when a CPU region is modified from the GPU, i.e. at least one cached query in
+    /// the range is host managed and not yet synced to the guest.
+    [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) {
+        bool result = false;
+        IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
+            result |= IsQueryDirty(location);
+            return result;
+        });
+        return result;
+    }
+
+    void CounterEnable(QueryType counter_type, bool is_enabled);
+
+    void CounterReset(QueryType counter_type);
+
+    void CounterClose(QueryType counter_type);
+
+    void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags,
+                       u32 payload, u32 subreport);
+
+    void NotifyWFI();
+
+    bool AccelerateHostConditionalRendering();
+
+    // Async downloads
+    void CommitAsyncFlushes();
+
+    bool HasUncommittedFlushes() const;
+
+    bool ShouldWaitAsyncFlushes();
+
+    void PopAsyncFlushes();
+
+    void NotifySegment(bool resume);
+
+    void BindToChannel(s32 id) override;
+
+protected:
+    /// Calls func(location) for every cached query whose 4-byte slot overlaps
+    /// [addr, addr + size), holding cache_mutex for the whole walk.
+    ///
+    /// If func returns bool, a true result stops the walk immediately (entries visited so far
+    /// are then not erased, even when remove_from_cache is set). If func returns anything else
+    /// every overlapping entry is visited.
+    ///
+    /// @tparam remove_from_cache When true, overlapping entries are erased from each page after
+    ///                           that page has been visited.
+    template <bool remove_from_cache, typename Func>
+    void IterateCache(VAddr addr, std::size_t size, Func&& func) {
+        // Must be invoke_result_t: the invoke_result trait struct itself is never the same type
+        // as bool, which would leave the early-exit branch below permanently disabled.
+        static constexpr bool RETURNS_BOOL =
+            std::is_same_v<std::invoke_result_t<Func, QueryLocation>, bool>;
+        const u64 addr_begin = addr;
+        const u64 addr_end = addr_begin + size;
+
+        const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS;
+        std::scoped_lock lock(cache_mutex);
+        for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) {
+            const u64 page_start = page << Core::Memory::YUZU_PAGEBITS;
+            // query_location is the byte offset of the query slot inside this page.
+            const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) {
+                const u64 cache_begin = page_start + query_location;
+                const u64 cache_end = cache_begin + sizeof(u32);
+                return cache_begin < addr_end && addr_begin < cache_end;
+            };
+            const auto it = cached_queries.find(page);
+            if (it == std::end(cached_queries)) {
+                continue;
+            }
+            auto& contents = it->second;
+            for (auto& query : contents) {
+                if (!in_range(query.first)) {
+                    continue;
+                }
+                if constexpr (RETURNS_BOOL) {
+                    if (func(query.second)) {
+                        return;
+                    }
+                } else {
+                    func(query.second);
+                }
+            }
+            if constexpr (remove_from_cache) {
+                const auto in_range2 = [&](const std::pair<u32, QueryLocation>& pair) {
+                    return in_range(pair.first);
+                };
+                std::erase_if(contents, in_range2);
+            }
+        }
+    }
+
+    /// Page index (address >> YUZU_PAGEBITS) -> { offset inside the page -> query handle }.
+    using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>;
+
+    void InvalidateQuery(QueryLocation location);
+    bool IsQueryDirty(QueryLocation location);
+    bool SemiFlushQueryDirty(QueryLocation location);
+    void RequestGuestHostSync();
+    void UnregisterPending();
+
+    ContentCache cached_queries;
+    std::mutex cache_mutex; ///< Guards cached_queries (taken by IterateCache).
+
+    struct QueryCacheBaseImpl;
+    friend struct QueryCacheBaseImpl;
+    friend RuntimeType;
+
+    std::unique_ptr<QueryCacheBaseImpl> impl;
+};
+
+} // namespace VideoCommon
\ No newline at end of file
diff --git a/src/video_core/query_cache/query_stream.h b/src/video_core/query_cache/query_stream.h
new file mode 100644
index 000000000..39da6ac07
--- /dev/null
+++ b/src/video_core/query_cache/query_stream.h
@@ -0,0 +1,149 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <deque>
+#include <mutex>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/query_cache/bank_base.h"
+#include "video_core/query_cache/query_base.h"
+
+namespace VideoCommon {
+
+/// Base interface of a query streamer. Every hook except GetQuery, WriteCounter and Free has a
+/// do-nothing default so implementations only override what they need.
+class StreamerInterface {
+public:
+    explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {}
+    virtual ~StreamerInterface() = default;
+
+    virtual QueryBase* GetQuery(size_t id) = 0;
+
+    virtual void StartCounter() {
+        /* Do Nothing */
+    }
+
+    virtual void PauseCounter() {
+        /* Do Nothing */
+    }
+
+    virtual void ResetCounter() {
+        /* Do Nothing */
+    }
+
+    virtual void CloseCounter() {
+        /* Do Nothing */
+    }
+
+    virtual bool HasPendingSync() const {
+        return false;
+    }
+
+    virtual void PresyncWrites() {
+        /* Do Nothing */
+    }
+
+    virtual void SyncWrites() {
+        /* Do Nothing */
+    }
+
+    virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
+                                std::optional<u32> subreport = std::nullopt) = 0;
+
+    virtual bool HasUnsyncedQueries() const {
+        return false;
+    }
+
+    virtual void PushUnsyncedQueries() {
+        /* Do Nothing */
+    }
+
+    virtual void PopUnsyncedQueries() {
+        /* Do Nothing */
+    }
+
+    virtual void Free(size_t query_id) = 0;
+
+    size_t GetId() const {
+        return id;
+    }
+
+    /// Bitmask (bit N = streamer id N) of the streamers this one depends on.
+    u64 GetDependenceMask() const {
+        return dependence_mask;
+    }
+
+    /// Bitmask (bit N = streamer id N) of the streamers that depend on this one.
+    u64 GetDependentMask() const {
+        // Reports the reverse edges recorded by MakeDependent, not dependence_mask.
+        return dependent_mask;
+    }
+
+    u64 GetAmmendValue() const {
+        return ammend_value;
+    }
+
+    void SetAccumulationValue(u64 new_value) {
+        acumulation_value = new_value;
+    }
+
+protected:
+    /// Records that this streamer depends on depend_on: sets depend_on's bit in our
+    /// dependence_mask and our bit in depend_on's dependent_mask.
+    void MakeDependent(StreamerInterface* depend_on) {
+        dependence_mask |= 1ULL << depend_on->id;
+        depend_on->dependent_mask |= 1ULL << id;
+    }
+
+    const size_t id;
+    u64 dependence_mask;
+    u64 dependent_mask;
+    u64 ammend_value{};
+    u64 acumulation_value{};
+};
+
+/// Streamer that keeps its queries in a slot container and recycles released slot ids.
+template <typename QueryType>
+class SimpleStreamer : public StreamerInterface {
+public:
+    explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {}
+    virtual ~SimpleStreamer() = default;
+
+protected:
+    /// Returns the query stored in slot query_id, or nullptr when the id is out of range.
+    virtual QueryType* GetQuery(size_t query_id) override {
+        if (query_id < slot_queries.size()) {
+            return &slot_queries[query_id];
+        }
+        return nullptr;
+    }
+
+    /// Thread-safe wrapper around ReleaseQuery.
+    virtual void Free(size_t query_id) override {
+        std::scoped_lock lk(guard);
+        ReleaseQuery(query_id);
+    }
+
+    /// Constructs a query from args, reusing the oldest released slot when one is available,
+    /// and returns the slot id.
+    template <typename... Args, typename = decltype(QueryType(std::declval<Args>()...))>
+    size_t BuildQuery(Args&&... args) {
+        std::scoped_lock lk(guard);
+        if (!old_queries.empty()) {
+            size_t new_id = old_queries.front();
+            old_queries.pop_front();
+            // NOTE(review): the slot's previous object is overwritten without running its
+            // destructor; that is only safe if QueryType is trivially destructible - confirm.
+            new (&slot_queries[new_id]) QueryType(std::forward<Args>(args)...);
+            return new_id;
+        }
+        size_t new_id = slot_queries.size();
+        slot_queries.emplace_back(std::forward<Args>(args)...);
+        return new_id;
+    }
+
+    /// Puts query_id on the free list. Callers must hold guard; an out-of-range id is a bug.
+    void ReleaseQuery(size_t query_id) {
+        if (query_id < slot_queries.size()) {
+            old_queries.push_back(query_id);
+            return;
+        }
+        UNREACHABLE();
+    }
+
+    std::mutex guard;                 ///< Guards slot_queries growth and old_queries.
+    std::deque<QueryType> slot_queries; ///< Query storage indexed by slot id.
+    std::deque<size_t> old_queries;   ///< Released slot ids, reused in FIFO order.
+};
+
+} // namespace VideoCommon
\ No newline at end of file
diff --git a/src/video_core/query_cache/types.h b/src/video_core/query_cache/types.h
new file mode 100644
index 000000000..e9226bbfc
--- /dev/null
+++ b/src/video_core/query_cache/types.h
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+// Bit flags describing how a query report must be written. Combined with the operators
+// generated by DECLARE_ENUM_FLAG_OPERATORS below.
+enum class QueryPropertiesFlags : u32 {
+    // NOTE(review): the rasterizers in this change treat this bit as "report carries a
+    // timestamp" (they write GPU ticks at addr + 8 and a 64-bit payload) - confirm the name.
+    HasTimeout = 1 << 0,
+    // The OpenGL rasterizer in this change defers the write through SignalFence when set.
+    IsAFence = 1 << 1,
+};
+DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags)
+
+// This should always be equivalent to maxwell3d Report Semaphore Reports
+// The numeric values are significant; ids 8, 20, 22 and 23 are not enumerated here.
+enum class QueryType : u32 {
+    Payload = 0, // "None" in docs, but confirmed via hardware to return the payload
+    VerticesGenerated = 1,
+    ZPassPixelCount = 2,
+    PrimitivesGenerated = 3,
+    AlphaBetaClocks = 4,
+    VertexShaderInvocations = 5,
+    StreamingPrimitivesNeededMinusSucceeded = 6,
+    GeometryShaderInvocations = 7,
+    GeometryShaderPrimitivesGenerated = 9,
+    ZCullStats0 = 10,
+    StreamingPrimitivesSucceeded = 11,
+    ZCullStats1 = 12,
+    StreamingPrimitivesNeeded = 13,
+    ZCullStats2 = 14,
+    ClipperInvocations = 15,
+    ZCullStats3 = 16,
+    ClipperPrimitivesGenerated = 17,
+    VtgPrimitivesOut = 18,
+    PixelShaderInvocations = 19,
+    ZPassPixelCount64 = 21,
+    IEEECleanColorTarget = 24,
+    IEEECleanZetaTarget = 25,
+    StreamingByteCount = 26,
+    TessellationInitInvocations = 27,
+    BoundingRectangle = 28,
+    TessellationShaderInvocations = 29,
+    TotalStreamingPrimitivesNeededMinusSucceeded = 30,
+    TessellationShaderPrimitivesGenerated = 31,
+    // Sentinel: one past the highest id above (evaluates to 32), not a real query type.
+    MaxQueryTypes,
+};
+
+// Comparison modes for Host Conditional Rendering
+enum class ComparisonMode : u32 {
+    False = 0,
+    True = 1,
+    Conditional = 2,
+    IfEqual = 3,
+    IfNotEqual = 4,
+    MaxComparisonMode, // Sentinel: number of modes above (5).
+};
+
+// Reduction ops.
+enum class ReductionOp : u32 {
+    RedAdd = 0,
+    RedMin = 1,
+    RedMax = 2,
+    RedInc = 3,
+    RedDec = 4,
+    RedAnd = 5,
+    RedOr = 6,
+    RedXor = 7,
+    MaxReductionOp, // Sentinel: number of ops above (8).
+};
+
+} // namespace VideoCommon
\ No newline at end of file diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp index 4a197d65d..f200a650f 100644 --- a/src/video_core/rasterizer_accelerated.cpp +++ b/src/video_core/rasterizer_accelerated.cpp @@ -13,7 +13,8 @@ namespace VideoCore { using namespace Core::Memory; -RasterizerAccelerated::RasterizerAccelerated(Memory& cpu_memory_) : cpu_memory{cpu_memory_} {} +RasterizerAccelerated::RasterizerAccelerated(Memory& cpu_memory_) + : cached_pages(std::make_unique<CachedPages>()), cpu_memory{cpu_memory_} {} RasterizerAccelerated::~RasterizerAccelerated() = default; @@ -26,7 +27,7 @@ void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int del std::atomic_thread_fence(std::memory_order_acquire); const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); for (u64 page = addr >> YUZU_PAGEBITS; page != page_end; ++page) { - std::atomic_uint16_t& count = cached_pages.at(page >> 2).Count(page); + std::atomic_uint16_t& count = cached_pages->at(page >> 2).Count(page); if (delta > 0) { ASSERT_MSG(count.load(std::memory_order::relaxed) < UINT16_MAX, "Count may overflow!"); diff --git a/src/video_core/rasterizer_accelerated.h b/src/video_core/rasterizer_accelerated.h index 7118b8aff..e6c0ea87a 100644 --- a/src/video_core/rasterizer_accelerated.h +++ b/src/video_core/rasterizer_accelerated.h @@ -41,7 +41,8 @@ private: }; static_assert(sizeof(CacheEntry) == 8, "CacheEntry should be 8 bytes!"); - std::array<CacheEntry, 0x2000000> cached_pages; + using CachedPages = std::array<CacheEntry, 0x2000000>; + std::unique_ptr<CachedPages> cached_pages; Core::Memory::Memory& cpu_memory; }; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index cb8029a4f..af1469147 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -12,6 +12,7 @@ #include "video_core/cache_types.h" #include "video_core/engines/fermi_2d.h" #include 
"video_core/gpu.h" +#include "video_core/query_cache/types.h" #include "video_core/rasterizer_download_area.h" namespace Tegra { @@ -26,11 +27,6 @@ struct ChannelState; namespace VideoCore { -enum class QueryType { - SamplesPassed, -}; -constexpr std::size_t NumQueryTypes = 1; - enum class LoadCallbackStage { Prepare, Build, @@ -58,10 +54,11 @@ public: virtual void DispatchCompute() = 0; /// Resets the counter of a query - virtual void ResetCounter(QueryType type) = 0; + virtual void ResetCounter(VideoCommon::QueryType type) = 0; /// Records a GPU query and caches it - virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; /// Signal an uniform buffer binding virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -83,7 +80,7 @@ public: virtual void SignalReference() = 0; /// Release all pending fences. - virtual void ReleaseFences() = 0; + virtual void ReleaseFences(bool force = true) = 0; /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 3e12a8813..78ea5208b 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -89,9 +89,6 @@ public: void RequestScreenshot(void* data, std::function<void(bool)> callback, const Layout::FramebufferLayout& layout); - /// This is called to notify the rendering backend of a surface change - virtual void NotifySurfaceChanged() {} - protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. 
std::unique_ptr<Core::Frontend::GraphicsContext> context; diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index 92ecf6682..65cd5aa06 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {} void RasterizerNull::DrawTexture() {} void RasterizerNull::Clear(u32 layer_count) {} void RasterizerNull::DispatchCompute() {} -void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} -void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional<u64> timestamp) { +void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {} +void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { if (!gpu_memory) { return; } - - gpu_memory->Write(gpu_addr, u64{0}); - if (timestamp) { - gpu_memory->Write(gpu_addr + 8, *timestamp); + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + u64 ticks = m_gpu.GetTicks(); + gpu_memory->Write<u64>(gpu_addr + 8, ticks); + gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload)); + } else { + gpu_memory->Write<u32>(gpu_addr, payload); } } void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) { syncpoint_manager.IncrementHost(value); } void RasterizerNull::SignalReference() {} -void RasterizerNull::ReleaseFences() {} +void RasterizerNull::ReleaseFences(bool) {} void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} void RasterizerNull::WaitForIdle() {} void RasterizerNull::FragmentBarrier() {} diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 93b9a6971..23001eeb8 100644 --- 
a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -42,8 +42,9 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -63,7 +64,7 @@ public: void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index f9ca55c36..d70501860 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -34,13 +34,13 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac : texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, program_manager{program_manager_}, info{info_} { switch (device.GetShaderBackend()) { - case Settings::ShaderBackend::GLSL: + case Settings::ShaderBackend::Glsl: source_program = CreateProgram(code, GL_COMPUTE_SHADER); break; - case Settings::ShaderBackend::GLASM: + case Settings::ShaderBackend::Glasm: assembly_program = CompileProgram(code, GL_COMPUTE_PROGRAM_NV); break; - case 
Settings::ShaderBackend::SPIRV: + case Settings::ShaderBackend::SpirV: source_program = CreateProgram(code_v, GL_COMPUTE_SHADER); break; } diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 33e63c17d..94258ccd0 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -106,6 +106,43 @@ bool IsASTCSupported() { return true; } +static bool HasSlowSoftwareAstc(std::string_view vendor_name, std::string_view renderer) { +// ifdef for Unix reduces string comparisons for non-Windows drivers, and Intel +#ifdef YUZU_UNIX + // Sorted vaguely by how likely a vendor is to appear + if (vendor_name == "AMD") { + // RadeonSI + return true; + } + if (vendor_name == "Intel") { + // Must be inside YUZU_UNIX ifdef as the Windows driver uses the same vendor string + // iris, crocus + const bool is_intel_dg = (renderer.find("DG") != std::string_view::npos); + return is_intel_dg; + } + if (vendor_name == "nouveau") { + return true; + } + if (vendor_name == "X.Org") { + // R600 + return true; + } +#endif + if (vendor_name == "Collabora Ltd") { + // Zink + return true; + } + if (vendor_name == "Microsoft Corporation") { + // d3d12 + return true; + } + if (vendor_name == "Mesa/X.org") { + // llvmpipe, softpipe, virgl + return true; + } + return false; +} + [[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) { const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); return nsight || HasExtension(extensions, "GL_EXT_debug_tool") || @@ -120,12 +157,16 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { } vendor_name = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); + const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::vector extensions = GetExtensions(); const 
bool is_nvidia = vendor_name == "NVIDIA Corporation"; const bool is_amd = vendor_name == "ATI Technologies Inc."; const bool is_intel = vendor_name == "Intel"; + const bool has_slow_software_astc = + !is_nvidia && !is_amd && HasSlowSoftwareAstc(vendor_name, renderer); + #ifdef __unix__ constexpr bool is_linux = true; #else @@ -152,7 +193,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); - has_astc = IsASTCSupported(); + has_astc = !has_slow_software_astc && IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); @@ -177,15 +218,15 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; shader_backend = Settings::values.shader_backend.GetValue(); - use_assembly_shaders = shader_backend == Settings::ShaderBackend::GLASM && + use_assembly_shaders = shader_backend == Settings::ShaderBackend::Glasm && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; - if (shader_backend == Settings::ShaderBackend::GLASM && !use_assembly_shaders) { + if (shader_backend == Settings::ShaderBackend::Glasm && !use_assembly_shaders) { LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); - shader_backend = Settings::ShaderBackend::GLSL; + shader_backend = Settings::ShaderBackend::Glsl; } - if (shader_backend == Settings::ShaderBackend::GLSL && is_nvidia) { + if (shader_backend == Settings::ShaderBackend::Glsl && is_nvidia) { const std::string_view driver_version = version.substr(13); const int version_major = std::atoi(driver_version.substr(0, driver_version.find(".")).data()); diff --git 
a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 71f720c63..44a771d65 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -220,7 +220,8 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c ASSERT(num_textures <= MAX_TEXTURES); ASSERT(num_images <= MAX_IMAGES); - const bool assembly_shaders{assembly_programs[0].handle != 0}; + const auto backend = device.GetShaderBackend(); + const bool assembly_shaders{backend == Settings::ShaderBackend::Glasm}; use_storage_buffers = !assembly_shaders || num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks(); writes_global_memory &= !use_storage_buffers; @@ -230,24 +231,23 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c GenerateTransformFeedbackState(); } const bool in_parallel = thread_worker != nullptr; - const auto backend = device.GetShaderBackend(); auto func{[this, sources_ = std::move(sources), sources_spirv_ = std::move(sources_spirv), shader_notify, backend, in_parallel, force_context_flush](ShaderContext::Context*) mutable { for (size_t stage = 0; stage < 5; ++stage) { switch (backend) { - case Settings::ShaderBackend::GLSL: + case Settings::ShaderBackend::Glsl: if (!sources_[stage].empty()) { source_programs[stage] = CreateProgram(sources_[stage], Stage(stage)); } break; - case Settings::ShaderBackend::GLASM: + case Settings::ShaderBackend::Glasm: if (!sources_[stage].empty()) { assembly_programs[stage] = CompileProgram(sources_[stage], AssemblyStage(stage)); } break; - case Settings::ShaderBackend::SPIRV: + case Settings::ShaderBackend::SpirV: if (!sources_spirv_[stage].empty()) { source_programs[stage] = CreateProgram(sources_spirv_[stage], Stage(stage)); } @@ -559,15 +559,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } void GraphicsPipeline::ConfigureTransformFeedbackImpl() 
const { - glTransformFeedbackStreamAttribsNV(num_xfb_attribs, xfb_attribs.data(), num_xfb_strides, - xfb_streams.data(), GL_INTERLEAVED_ATTRIBS); + glTransformFeedbackAttribsNV(num_xfb_attribs, xfb_attribs.data(), GL_SEPARATE_ATTRIBS); } void GraphicsPipeline::GenerateTransformFeedbackState() { // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal // when this is required. GLint* cursor{xfb_attribs.data()}; - GLint* current_stream{xfb_streams.data()}; for (size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { const auto& layout = key.xfb_state.layouts[feedback]; @@ -575,15 +573,6 @@ void GraphicsPipeline::GenerateTransformFeedbackState() { if (layout.varying_count == 0) { continue; } - *current_stream = static_cast<GLint>(feedback); - if (current_stream != xfb_streams.data()) { - // When stepping one stream, push the expected token - cursor[0] = GL_NEXT_BUFFER_NV; - cursor[1] = 0; - cursor[2] = 0; - cursor += XFB_ENTRY_STRIDE; - } - ++current_stream; const auto& locations = key.xfb_state.varyings[feedback]; std::optional<u32> current_index; @@ -619,7 +608,6 @@ void GraphicsPipeline::GenerateTransformFeedbackState() { } } num_xfb_attribs = static_cast<GLsizei>((cursor - xfb_attribs.data()) / XFB_ENTRY_STRIDE); - num_xfb_strides = static_cast<GLsizei>(current_stream - xfb_streams.data()); } void GraphicsPipeline::WaitForBuild() { diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 7b3d7eae8..74fc9cc3d 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -154,9 +154,7 @@ private: static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; - GLsizei num_xfb_strides{}; std::array<GLint, 128 * XFB_ENTRY_STRIDE * Maxwell::NumTransformFeedbackBuffers> xfb_attribs{}; - std::array<GLint, Maxwell::NumTransformFeedbackBuffers> 
xfb_streams{}; std::mutex built_mutex; std::condition_variable built_condvar; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 99d7347f5..ec142d48e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) - : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} + : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 872513f22..0721e0b3d 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,7 +26,7 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; class QueryCache final - : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { + : public VideoCommon::QueryCacheLegacy<QueryCache, CachedQuery, CounterStream, HostCounter> { public: explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index aadd6967c..27e2de1bf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -380,18 +380,55 @@ void RasterizerOpenGL::DispatchCompute() { pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; + auto indirect_address = kepler_compute->GetIndirectComputeAddress(); + if (indirect_address) { + // DispatchIndirect + static constexpr 
auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op); + glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, buffer->Handle()); + glDispatchComputeIndirect(static_cast<GLintptr>(offset)); + return; + } glDispatchCompute(qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z); ++num_queued_commands; has_written_global_memory |= pipeline->WritesGlobalMemory(); } -void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { - query_cache.ResetCounter(type); +void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { + if (type == VideoCommon::QueryType::ZPassPixelCount64) { + query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed); + } } -void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional<u64> timestamp) { - query_cache.Query(gpu_addr, type, timestamp); +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { + if (type == VideoCommon::QueryType::ZPassPixelCount64) { + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); + } else { + query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt); + } + return; + } + if (type != VideoCommon::QueryType::Payload) { + payload = 1u; + } + std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() { + if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { + u64 ticks = gpu.GetTicks(); + memory_manager->Write<u64>(gpu_addr + 8, ticks); + memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload)); + } else { + memory_manager->Write<u32>(gpu_addr, payload); + } + }); + if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) { + 
SignalFence(std::move(func)); + return; + } + func(); } void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -562,8 +599,8 @@ void RasterizerOpenGL::SignalReference() { fence_manager.SignalOrdering(); } -void RasterizerOpenGL::ReleaseFences() { - fence_manager.WaitPendingFences(); +void RasterizerOpenGL::ReleaseFences(bool force) { + fence_manager.WaitPendingFences(force); } void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, @@ -1335,7 +1372,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, } const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; const auto [buffer, offset] = buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); @@ -1344,8 +1382,12 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, const std::span copy_span{©, 1}; if constexpr (IS_IMAGE_UPLOAD) { + texture_cache.PrepareImage(image_id, true, false); image->UploadMemory(buffer->Handle(), offset, copy_span); } else { + if (offset % BytesPerBlock(image->info.format)) { + return false; + } texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, buffer_operand.address, buffer_size); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 8eda2ddba..ceffe1f1e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -86,8 +86,9 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType 
type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -107,7 +108,7 @@ public: void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force = true) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 7e1d7f92e..2888e0238 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -445,7 +445,8 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline( ShaderContext::ShaderPools& pools, const GraphicsPipelineKey& key, std::span<Shader::Environment* const> envs, bool use_shader_workers, bool force_context_flush) try { - LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash()); + auto hash = key.Hash(); + LOG_INFO(Render_OpenGL, "0x{:016x}", hash); size_t env_index{}; u32 total_storage_buffers{}; std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs; @@ -474,7 +475,7 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline( Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset, index == 0); if (Settings::values.dump_shaders) { - env.Dump(key.unique_hashes[index]); + env.Dump(hash, key.unique_hashes[index]); } if (!uses_vertex_a || index != 1) { @@ 
-522,14 +523,14 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline( const auto runtime_info{ MakeRuntimeInfo(key, program, previous_program, glasm_use_storage_buffers, use_glasm)}; switch (device.GetShaderBackend()) { - case Settings::ShaderBackend::GLSL: + case Settings::ShaderBackend::Glsl: ConvertLegacyToGeneric(program, runtime_info); sources[stage_index] = EmitGLSL(profile, runtime_info, program, binding); break; - case Settings::ShaderBackend::GLASM: + case Settings::ShaderBackend::Glasm: sources[stage_index] = EmitGLASM(profile, runtime_info, program, binding); break; - case Settings::ShaderBackend::SPIRV: + case Settings::ShaderBackend::SpirV: ConvertLegacyToGeneric(program, runtime_info); sources_spirv[stage_index] = EmitSPIRV(profile, runtime_info, program, binding); break; @@ -566,12 +567,13 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( ShaderContext::ShaderPools& pools, const ComputePipelineKey& key, Shader::Environment& env, bool force_context_flush) try { - LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash()); + auto hash = key.Hash(); + LOG_INFO(Render_OpenGL, "0x{:016x}", hash); Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; if (Settings::values.dump_shaders) { - env.Dump(key.Hash()); + env.Dump(hash, key.unique_hash); } auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; @@ -582,13 +584,13 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( std::string code{}; std::vector<u32> code_spirv; switch (device.GetShaderBackend()) { - case Settings::ShaderBackend::GLSL: + case Settings::ShaderBackend::Glsl: code = EmitGLSL(profile, program); break; - case Settings::ShaderBackend::GLASM: + case Settings::ShaderBackend::Glasm: code = EmitGLASM(profile, info, program); break; - case Settings::ShaderBackend::SPIRV: + case Settings::ShaderBackend::SpirV: code_spirv = 
EmitSPIRV(profile, program); break; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 3b446be07..9cafd2983 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -232,10 +232,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { if (IsPixelFormatASTC(info.format) && info.size.depth == 1 && !runtime.HasNativeASTC()) { - return Settings::values.accelerate_astc.GetValue() && + return Settings::values.accelerate_astc.GetValue() == Settings::AstcDecodeMode::Gpu && Settings::values.astc_recompression.GetValue() == - Settings::AstcRecompression::Uncompressed && - !Settings::values.async_astc.GetValue(); + Settings::AstcRecompression::Uncompressed; } // Disable other accelerated uploads for now as they don't implement swizzled uploads return false; @@ -267,7 +266,8 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeDecodedAsync(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { - return Settings::values.async_astc.GetValue(); + return Settings::values.accelerate_astc.GetValue() == + Settings::AstcDecodeMode::CpuAsynchronous; } return false; } diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index c7dc7e0a1..5ea9e2378 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -116,6 +116,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT {GL_DEPTH_COMPONENT16, 
GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM + {GL_DEPTH_COMPONENT24, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT_24_8}, // X8_D24_UNORM {GL_STENCIL_INDEX8, GL_STENCIL, GL_UNSIGNED_BYTE}, // S8_UINT {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 2a74c1d05..6b8d4e554 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -473,7 +473,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glBindTextureUnit(0, screen_info.display_texture); auto anti_aliasing = Settings::values.anti_aliasing.GetValue(); - if (anti_aliasing > Settings::AntiAliasing::LastAA) { + if (anti_aliasing >= Settings::AntiAliasing::MaxEnum) { LOG_ERROR(Render_OpenGL, "Invalid antialiasing option selected {}", anti_aliasing); anti_aliasing = Settings::AntiAliasing::None; Settings::values.anti_aliasing.SetValue(anti_aliasing); diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 544982d18..c437013e6 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -68,6 +68,7 @@ void UtilShaders::ASTCDecode(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles) { static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + program_manager.LocalMemoryWarmup(); const Extent2D tile_size{ .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index 28d4b15a0..f01d2394e 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ 
b/src/video_core/renderer_vulkan/blit_image.cpp @@ -3,10 +3,13 @@ #include <algorithm> +#include "video_core/renderer_vulkan/vk_texture_cache.h" + #include "common/settings.h" #include "video_core/host_shaders/blit_color_float_frag_spv.h" #include "video_core/host_shaders/convert_abgr8_to_d24s8_frag_spv.h" #include "video_core/host_shaders/convert_d24s8_to_abgr8_frag_spv.h" +#include "video_core/host_shaders/convert_d32f_to_abgr8_frag_spv.h" #include "video_core/host_shaders/convert_depth_to_float_frag_spv.h" #include "video_core/host_shaders/convert_float_to_depth_frag_spv.h" #include "video_core/host_shaders/convert_s8d24_to_abgr8_frag_spv.h" @@ -14,12 +17,12 @@ #include "video_core/host_shaders/vulkan_blit_depth_stencil_frag_spv.h" #include "video_core/host_shaders/vulkan_color_clear_frag_spv.h" #include "video_core/host_shaders/vulkan_color_clear_vert_spv.h" +#include "video_core/host_shaders/vulkan_depthstencil_clear_frag_spv.h" #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" -#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/surface.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -427,9 +430,11 @@ BlitImageHelper::BlitImageHelper(const Device& device_, Scheduler& scheduler_, blit_depth_stencil_frag(BuildShader(device, VULKAN_BLIT_DEPTH_STENCIL_FRAG_SPV)), clear_color_vert(BuildShader(device, VULKAN_COLOR_CLEAR_VERT_SPV)), clear_color_frag(BuildShader(device, VULKAN_COLOR_CLEAR_FRAG_SPV)), + clear_stencil_frag(BuildShader(device, VULKAN_DEPTHSTENCIL_CLEAR_FRAG_SPV)), convert_depth_to_float_frag(BuildShader(device, CONVERT_DEPTH_TO_FLOAT_FRAG_SPV)), convert_float_to_depth_frag(BuildShader(device, CONVERT_FLOAT_TO_DEPTH_FRAG_SPV)), 
convert_abgr8_to_d24s8_frag(BuildShader(device, CONVERT_ABGR8_TO_D24S8_FRAG_SPV)), + convert_d32f_to_abgr8_frag(BuildShader(device, CONVERT_D32F_TO_ABGR8_FRAG_SPV)), convert_d24s8_to_abgr8_frag(BuildShader(device, CONVERT_D24S8_TO_ABGR8_FRAG_SPV)), convert_s8d24_to_abgr8_frag(BuildShader(device, CONVERT_S8D24_TO_ABGR8_FRAG_SPV)), linear_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_LINEAR>)), @@ -554,6 +559,13 @@ void BlitImageHelper::ConvertABGR8ToD24S8(const Framebuffer* dst_framebuffer, Convert(*convert_abgr8_to_d24s8_pipeline, dst_framebuffer, src_image_view); } +void BlitImageHelper::ConvertD32FToABGR8(const Framebuffer* dst_framebuffer, + ImageView& src_image_view) { + ConvertPipelineColorTargetEx(convert_d32f_to_abgr8_pipeline, dst_framebuffer->RenderPass(), + convert_d32f_to_abgr8_frag); + ConvertDepthStencil(*convert_d32f_to_abgr8_pipeline, dst_framebuffer, src_image_view); +} + void BlitImageHelper::ConvertD24S8ToABGR8(const Framebuffer* dst_framebuffer, ImageView& src_image_view) { ConvertPipelineColorTargetEx(convert_d24s8_to_abgr8_pipeline, dst_framebuffer->RenderPass(), @@ -592,6 +604,30 @@ void BlitImageHelper::ClearColor(const Framebuffer* dst_framebuffer, u8 color_ma scheduler.InvalidateState(); } +void BlitImageHelper::ClearDepthStencil(const Framebuffer* dst_framebuffer, bool depth_clear, + f32 clear_depth, u8 stencil_mask, u32 stencil_ref, + u32 stencil_compare_mask, const Region2D& dst_region) { + const BlitDepthStencilPipelineKey key{ + .renderpass = dst_framebuffer->RenderPass(), + .depth_clear = depth_clear, + .stencil_mask = stencil_mask, + .stencil_compare_mask = stencil_compare_mask, + .stencil_ref = stencil_ref, + }; + const VkPipeline pipeline = FindOrEmplaceClearStencilPipeline(key); + const VkPipelineLayout layout = *clear_color_pipeline_layout; + scheduler.RequestRenderpass(dst_framebuffer); + scheduler.Record([pipeline, layout, clear_depth, dst_region](vk::CommandBuffer cmdbuf) { + constexpr std::array 
blend_constants{0.0f, 0.0f, 0.0f, 0.0f}; + cmdbuf.SetBlendConstants(blend_constants.data()); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + BindBlitState(cmdbuf, dst_region); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_FRAGMENT_BIT, clear_depth); + cmdbuf.Draw(3, 1, 0, 0); + }); + scheduler.InvalidateState(); +} + void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, const ImageView& src_image_view) { const VkPipelineLayout layout = *one_texture_pipeline_layout; @@ -819,6 +855,61 @@ VkPipeline BlitImageHelper::FindOrEmplaceClearColorPipeline(const BlitImagePipel return *clear_color_pipelines.back(); } +VkPipeline BlitImageHelper::FindOrEmplaceClearStencilPipeline( + const BlitDepthStencilPipelineKey& key) { + const auto it = std::ranges::find(clear_stencil_keys, key); + if (it != clear_stencil_keys.end()) { + return *clear_stencil_pipelines[std::distance(clear_stencil_keys.begin(), it)]; + } + clear_stencil_keys.push_back(key); + const std::array stages = MakeStages(*clear_color_vert, *clear_stencil_frag); + const auto stencil = VkStencilOpState{ + .failOp = VK_STENCIL_OP_KEEP, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_KEEP, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = key.stencil_compare_mask, + .writeMask = key.stencil_mask, + .reference = key.stencil_ref, + }; + const VkPipelineDepthStencilStateCreateInfo depth_stencil_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthTestEnable = key.depth_clear, + .depthWriteEnable = key.depth_clear, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_TRUE, + .front = stencil, + .back = stencil, + .minDepthBounds = 0.0f, + .maxDepthBounds = 0.0f, + }; + clear_stencil_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 
0, + .stageCount = static_cast<u32>(stages.size()), + .pStages = stages.data(), + .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pTessellationState = nullptr, + .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pDepthStencilState = &depth_stencil_ci, + .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO, + .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .layout = *clear_color_pipeline_layout, + .renderPass = key.renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + })); + return *clear_stencil_pipelines.back(); +} + void BlitImageHelper::ConvertPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass, bool is_target_depth) { if (pipeline) { diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index 2976a7d91..a032c71fb 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -27,6 +27,16 @@ struct BlitImagePipelineKey { Tegra::Engines::Fermi2D::Operation operation; }; +struct BlitDepthStencilPipelineKey { + constexpr auto operator<=>(const BlitDepthStencilPipelineKey&) const noexcept = default; + + VkRenderPass renderpass; + bool depth_clear; + u8 stencil_mask; + u32 stencil_compare_mask; + u32 stencil_ref; +}; + class BlitImageHelper { public: explicit BlitImageHelper(const Device& device, Scheduler& scheduler, @@ -57,6 +67,8 @@ public: void ConvertABGR8ToD24S8(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + void ConvertD32FToABGR8(const Framebuffer* dst_framebuffer, ImageView& src_image_view); + void ConvertD24S8ToABGR8(const Framebuffer* dst_framebuffer, ImageView& src_image_view); void ConvertS8D24ToABGR8(const Framebuffer* dst_framebuffer, 
ImageView& src_image_view); @@ -64,6 +76,10 @@ public: void ClearColor(const Framebuffer* dst_framebuffer, u8 color_mask, const std::array<f32, 4>& clear_color, const Region2D& dst_region); + void ClearDepthStencil(const Framebuffer* dst_framebuffer, bool depth_clear, f32 clear_depth, + u8 stencil_mask, u32 stencil_ref, u32 stencil_compare_mask, + const Region2D& dst_region); + private: void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, const ImageView& src_image_view); @@ -76,6 +92,8 @@ private: [[nodiscard]] VkPipeline FindOrEmplaceDepthStencilPipeline(const BlitImagePipelineKey& key); [[nodiscard]] VkPipeline FindOrEmplaceClearColorPipeline(const BlitImagePipelineKey& key); + [[nodiscard]] VkPipeline FindOrEmplaceClearStencilPipeline( + const BlitDepthStencilPipelineKey& key); void ConvertPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass, bool is_target_depth); @@ -108,9 +126,11 @@ private: vk::ShaderModule blit_depth_stencil_frag; vk::ShaderModule clear_color_vert; vk::ShaderModule clear_color_frag; + vk::ShaderModule clear_stencil_frag; vk::ShaderModule convert_depth_to_float_frag; vk::ShaderModule convert_float_to_depth_frag; vk::ShaderModule convert_abgr8_to_d24s8_frag; + vk::ShaderModule convert_d32f_to_abgr8_frag; vk::ShaderModule convert_d24s8_to_abgr8_frag; vk::ShaderModule convert_s8d24_to_abgr8_frag; vk::Sampler linear_sampler; @@ -122,11 +142,14 @@ private: std::vector<vk::Pipeline> blit_depth_stencil_pipelines; std::vector<BlitImagePipelineKey> clear_color_keys; std::vector<vk::Pipeline> clear_color_pipelines; + std::vector<BlitDepthStencilPipelineKey> clear_stencil_keys; + std::vector<vk::Pipeline> clear_stencil_pipelines; vk::Pipeline convert_d32_to_r32_pipeline; vk::Pipeline convert_r32_to_d32_pipeline; vk::Pipeline convert_d16_to_r16_pipeline; vk::Pipeline convert_r16_to_d16_pipeline; vk::Pipeline convert_abgr8_to_d24s8_pipeline; + vk::Pipeline convert_d32f_to_abgr8_pipeline; vk::Pipeline convert_d24s8_to_abgr8_pipeline; 
vk::Pipeline convert_s8d24_to_abgr8_pipeline; }; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index a8540339d..a08f2f67f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -126,7 +126,7 @@ struct FormatTuple { {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT - {VK_FORMAT_A2R10G10B10_UNORM_PACK32, Attachable | Storage}, // A2R10G10B10_UNORM + {VK_FORMAT_A2R10G10B10_UNORM_PACK32, Attachable}, // A2R10G10B10_UNORM {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5_UNORM (flipped with swizzle) {VK_FORMAT_R5G5B5A1_UNORM_PACK16}, // A5B5G5R1_UNORM (specially swizzled) {VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8_UNORM @@ -185,7 +185,7 @@ struct FormatTuple { {VK_FORMAT_BC2_SRGB_BLOCK}, // BC2_SRGB {VK_FORMAT_BC3_SRGB_BLOCK}, // BC3_SRGB {VK_FORMAT_BC7_SRGB_BLOCK}, // BC7_SRGB - {VK_FORMAT_R4G4B4A4_UNORM_PACK16}, // A4B4G4R4_UNORM + {VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT}, // A4B4G4R4_UNORM {VK_FORMAT_R4G4_UNORM_PACK8}, // G4R4_UNORM {VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB {VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // ASTC_2D_8X8_SRGB @@ -214,8 +214,9 @@ struct FormatTuple { {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}, // E5B9G9R9_FLOAT // Depth formats - {VK_FORMAT_D32_SFLOAT, Attachable}, // D32_FLOAT - {VK_FORMAT_D16_UNORM, Attachable}, // D16_UNORM + {VK_FORMAT_D32_SFLOAT, Attachable}, // D32_FLOAT + {VK_FORMAT_D16_UNORM, Attachable}, // D16_UNORM + {VK_FORMAT_X8_D24_UNORM_PACK32, Attachable}, // X8_D24_UNORM // Stencil formats {VK_FORMAT_S8_UINT, Attachable}, // S8_UINT diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 454bb66a4..c4c30d807 100644 --- 
a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -66,21 +66,6 @@ std::string BuildCommaSeparatedExtensions( return fmt::format("{}", fmt::join(available_extensions, ",")); } -DebugCallback MakeDebugCallback(const vk::Instance& instance, const vk::InstanceDispatch& dld) { - if (!Settings::values.renderer_debug) { - return DebugCallback{}; - } - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - const auto it = std::ranges::find_if(*properties, [](const auto& prop) { - return std::strcmp(VK_EXT_DEBUG_UTILS_EXTENSION_NAME, prop.extensionName) == 0; - }); - if (it != properties->end()) { - return CreateDebugUtilsCallback(instance); - } else { - return CreateDebugReportCallback(instance); - } -} - } // Anonymous namespace Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, @@ -103,7 +88,8 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary(context.get())), instance(CreateInstance(*library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, Settings::values.renderer_debug.GetValue())), - debug_callback(MakeDebugCallback(instance, dld)), + debug_messenger(Settings::values.renderer_debug ? 
CreateDebugUtilsCallback(instance) + : vk::DebugUtilsMessenger{}), surface(CreateSurface(instance, render_window.GetWindowInfo())), device(CreateDevice(instance, dld, *surface)), memory_allocator(device), state_tracker(), scheduler(device, state_tracker), diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index ca22c0baa..14e257cf7 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -7,11 +7,12 @@ #include <string> #include <variant> +#include "video_core/renderer_vulkan/vk_rasterizer.h" + #include "common/dynamic_library.h" #include "video_core/renderer_base.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_present_manager.h" -#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -34,8 +35,6 @@ class GPU; namespace Vulkan { -using DebugCallback = std::variant<vk::DebugUtilsMessenger, vk::DebugReportCallback>; - Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, VkSurfaceKHR surface); @@ -57,10 +56,6 @@ public: return device.GetDriverName(); } - void NotifySurfaceChanged() override { - present_manager.NotifySurfaceChanged(); - } - private: void Report() const; @@ -74,7 +69,7 @@ private: vk::InstanceDispatch dld; vk::Instance instance; - DebugCallback debug_callback; + vk::DebugUtilsMessenger debug_messenger; vk::SurfaceKHR surface; ScreenInfo screen_info; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index ad3b29f0e..52fc142d1 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -96,6 +96,7 @@ std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& 
framebuffer) { VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { switch (framebuffer.pixel_format) { case Service::android::PixelFormat::Rgba8888: + case Service::android::PixelFormat::Rgbx8888: return VK_FORMAT_A8B8G8R8_UNORM_PACK32; case Service::android::PixelFormat::Rgb565: return VK_FORMAT_R5G6B5_UNORM_PACK16; @@ -566,7 +567,7 @@ void BlitScreen::CreateDescriptorPool() { const VkDescriptorPoolCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, - .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .flags = 0, .maxSets = static_cast<u32>(image_count), .poolSizeCount = static_cast<u32>(pool_sizes.size()), .pPoolSizes = pool_sizes.data(), @@ -576,7 +577,7 @@ void BlitScreen::CreateDescriptorPool() { const VkDescriptorPoolCreateInfo ci_aa{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, - .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .flags = 0, .maxSets = static_cast<u32>(image_count), .poolSizeCount = static_cast<u32>(pool_sizes_aa.size()), .pPoolSizes = pool_sizes_aa.data(), diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index f8cd2a5d8..d8148e89a 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -7,8 +7,9 @@ #include <span> #include <vector> -#include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" + +#include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -60,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } + if 
(device.IsExtConditionalRendering()) { + flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -528,17 +532,20 @@ void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bi buffer_handles.push_back(handle); } if (device.IsExtExtendedDynamicStateSupported()) { - scheduler.Record([bindings_ = std::move(bindings), + scheduler.Record([this, bindings_ = std::move(bindings), buffer_handles_ = std::move(buffer_handles)](vk::CommandBuffer cmdbuf) { cmdbuf.BindVertexBuffers2EXT(bindings_.min_index, - bindings_.max_index - bindings_.min_index, + std::min(bindings_.max_index - bindings_.min_index, + device.GetMaxVertexInputBindings()), buffer_handles_.data(), bindings_.offsets.data(), bindings_.sizes.data(), bindings_.strides.data()); }); } else { - scheduler.Record([bindings_ = std::move(bindings), + scheduler.Record([this, bindings_ = std::move(bindings), buffer_handles_ = std::move(buffer_handles)](vk::CommandBuffer cmdbuf) { - cmdbuf.BindVertexBuffers(bindings_.min_index, bindings_.max_index - bindings_.min_index, + cmdbuf.BindVertexBuffers(bindings_.min_index, + std::min(bindings_.max_index - bindings_.min_index, + device.GetMaxVertexInputBindings()), buffer_handles_.data(), bindings_.offsets.data()); }); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 3bc8553e1..617f92910 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -3,20 +3,28 @@ #include <array> #include <memory> +#include <numeric> #include <optional> #include <utility> +#include "video_core/renderer_vulkan/vk_texture_cache.h" + #include "common/assert.h" #include "common/common_types.h" #include "common/div_ceil.h" +#include "common/vector_math.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include 
"video_core/host_shaders/convert_msaa_to_non_msaa_comp_spv.h" +#include "video_core/host_shaders/convert_non_msaa_to_msaa_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" +#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/texture_cache/types.h" @@ -56,6 +64,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE }, }}; +constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -66,6 +98,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; +constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, + .texture_buffers = 
0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -93,6 +135,33 @@ constexpr DescriptorBankInfo ASTC_BANK_INFO{ .score = 2, }; +constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> MSAA_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + +constexpr DescriptorBankInfo MSAA_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 0, + .texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 2, + .score = 2, +}; + constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ .dstBinding = 0, .dstArrayElement = 0, @@ -102,6 +171,24 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; +constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 3, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + +constexpr VkDescriptorUpdateTemplateEntry MSAA_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -130,13 +217,21 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; + +struct 
QueriesPrefixScanPushConstants { + u32 min_accumulation_base; + u32 max_accumulation_base; + u32 accumulation_limit; + u32 buffer_offset; +}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntry> templates, const DescriptorBankInfo& bank_info, - vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) + vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, + std::optional<u32> optional_subgroup_size) : device{device_} { descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -169,6 +264,9 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, }); descriptor_allocator = descriptor_pool.Allocator(*descriptor_set_layout, bank_info); } + if (code.empty()) { + return; + } module = device.GetLogical().CreateShaderModule({ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, @@ -177,13 +275,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, .pCode = code.data(), }); device.SaveShader(code); + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = nullptr, + .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, + }; + bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; pipeline = device.GetLogical().CreateComputePipeline({ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, .stage{ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .pNext = nullptr, + .pNext = use_setup_size ? 
&subgroup_size_ci : nullptr, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = *module, @@ -301,6 +405,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } +ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, + INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, + RESOLVE_CONDITIONAL_RENDER_COMP_SPV), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, + u32 src_offset, bool compare_to_zero) { + const size_t compare_size = compare_to_zero ? 8 : 24; + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + 
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); + }); +} + +QueriesPrefixScanPass::QueriesPrefixScanPass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass( + device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) + ? 
std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) + : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, + VkBuffer src_buffer, size_t number_of_sums, + size_t min_accumulation_limit, size_t max_accumulation_limit) { + size_t current_runs = number_of_sums; + size_t offset = 0; + while (current_runs != 0) { + static constexpr size_t DISPATCH_SIZE = 2048U; + size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); + current_runs -= runs_to_do; + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + size_t used_offset = offset; + offset += runs_to_do; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, + runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const 
QueriesPrefixScanPushConstants uniforms{ + .min_accumulation_base = static_cast<u32>(min_accumulation_limit), + .max_accumulation_base = static_cast<u32>(max_accumulation_limit), + .accumulation_limit = static_cast<u32>(runs_to_do - 1), + .buffer_offset = static_cast<u32>(used_offset), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, + write_barrier); + }); + } +} + ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, @@ -412,4 +633,100 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, scheduler.Finish(); } +MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, MSAA_DESCRIPTOR_SET_BINDINGS, + MSAA_DESCRIPTOR_UPDATE_TEMPLATE, MSAA_BANK_INFO, {}, + CONVERT_NON_MSAA_TO_MSAA_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + compute_pass_descriptor_queue{compute_pass_descriptor_queue_} { + const auto make_msaa_pipeline = [this](size_t i, std::span<const u32> code) { + modules[i] = device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = 
static_cast<u32>(code.size_bytes()), + .pCode = code.data(), + }); + pipelines[i] = device.GetLogical().CreateComputePipeline({ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *modules[i], + .pName = "main", + .pSpecializationInfo = nullptr, + }, + .layout = *layout, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }); + }; + make_msaa_pipeline(0, CONVERT_NON_MSAA_TO_MSAA_COMP_SPV); + make_msaa_pipeline(1, CONVERT_MSAA_TO_NON_MSAA_COMP_SPV); +} + +MSAACopyPass::~MSAACopyPass() = default; + +void MSAACopyPass::CopyImage(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies, + bool msaa_to_non_msaa) { + const VkPipeline msaa_pipeline = *pipelines[msaa_to_non_msaa ? 1 : 0]; + scheduler.RequestOutsideRenderPassOperationContext(); + for (const VideoCommon::ImageCopy& copy : copies) { + ASSERT(copy.src_subresource.base_layer == 0); + ASSERT(copy.src_subresource.num_layers == 1); + ASSERT(copy.dst_subresource.base_layer == 0); + ASSERT(copy.dst_subresource.num_layers == 1); + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddImage( + src_image.StorageImageView(copy.src_subresource.base_level)); + compute_pass_descriptor_queue.AddImage( + dst_image.StorageImageView(copy.dst_subresource.base_level)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + const Common::Vec3<u32> num_dispatches = { + Common::DivCeil(copy.extent.width, 8U), + Common::DivCeil(copy.extent.height, 8U), + copy.extent.depth, + }; + + scheduler.Record([this, dst = dst_image.Handle(), msaa_pipeline, num_dispatches, + descriptor_data](vk::CommandBuffer cmdbuf) { + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, 
descriptor_data); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, msaa_pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.Dispatch(num_dispatches.x, num_dispatches.y, num_dispatches.z); + const VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier); + }); + } +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index dd3927376..7b8f938c1 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -3,6 +3,7 @@ #pragma once +#include <optional> #include <span> #include <utility> @@ -10,6 +11,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" +#include "video_core/texture_cache/types.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -31,7 +33,8 @@ public: vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntry> templates, const DescriptorBankInfo& bank_info, - vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); + vk::Span<VkPushConstantRange> push_constants, std::span<const 
u32> code, + std::optional<u32> optional_subgroup_size = std::nullopt); ~ComputePass(); protected: @@ -82,6 +85,33 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class ConditionalRenderingResolvePass final : public ComputePass { +public: + explicit ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + +class QueriesPrefixScanPass final : public ComputePass { +public: + explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, + size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, @@ -101,4 +131,22 @@ private: MemoryAllocator& memory_allocator; }; +class MSAACopyPass final : public ComputePass { +public: + explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + ~MSAACopyPass(); + + void CopyImage(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies, bool msaa_to_non_msaa); + +private: + Scheduler& scheduler; + StagingBufferPool& staging_buffer_pool; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; + std::array<vk::ShaderModule, 2> modules; + std::array<vk::Pipeline, 
2> pipelines; +}; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index b5ae6443c..6048a301f 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -77,7 +77,7 @@ static void AllocatePool(const Device& device, DescriptorBank& bank) { bank.pools.push_back(device.GetLogical().CreateDescriptorPool({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, - .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .flags = 0, .maxSets = sets_per_pool, .poolSizeCount = static_cast<u32>(pool_cursor), .pPoolSizes = std::data(pool_sizes), diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 145359d4e..336573574 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -7,6 +7,7 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" namespace Core { @@ -20,7 +21,6 @@ class RasterizerInterface; namespace Vulkan { class Device; -class QueryCache; class Scheduler; class InnerFence : public VideoCommon::FenceBase { diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index 9bcdca2fb..ce8f3f3c2 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -150,7 +150,7 @@ void FSR::CreateDescriptorPool() { const VkDescriptorPoolCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, - .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .flags = 0, .maxSets = static_cast<u32>(image_count * 2), .poolSizeCount = static_cast<u32>(pool_sizes.size()), .pPoolSizes = 
pool_sizes.data(), diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index ad35cacac..f2fd2670f 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -7,9 +7,10 @@ #include <boost/container/small_vector.hpp> #include <boost/container/static_vector.hpp> +#include "video_core/renderer_vulkan/pipeline_helper.h" + #include "common/bit_field.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/pipeline_helper.h" #include "video_core/renderer_vulkan/pipeline_statistics.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 4f84d8497..a1ec1a100 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -294,10 +294,11 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device texture_cache{texture_cache_}, shader_notify{shader_notify_}, use_asynchronous_shaders{Settings::values.use_asynchronous_shaders.GetValue()}, use_vulkan_pipeline_cache{Settings::values.use_vulkan_driver_pipeline_cache.GetValue()}, - workers(device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY - ? 
1 - : (std::max(std::thread::hardware_concurrency(), 2U) - 1), - "VkPipelineBuilder"), +#ifdef ANDROID + workers(1, "VkPipelineBuilder"), +#else + workers(std::max(std::thread::hardware_concurrency(), 2U) - 1, "VkPipelineBuilder"), +#endif serialization_thread(1, "VkPipelineSerialization") { const auto& float_control{device.FloatControlProperties()}; const VkDriverId driver_id{device.GetDriverID()}; @@ -584,7 +585,8 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline( ShaderPools& pools, const GraphicsPipelineCacheKey& key, std::span<Shader::Environment* const> envs, PipelineStatistics* statistics, bool build_in_parallel) try { - LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash()); + auto hash = key.Hash(); + LOG_INFO(Render_Vulkan, "0x{:016x}", hash); size_t env_index{0}; std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs; const bool uses_vertex_a{key.unique_hashes[0] != 0}; @@ -610,9 +612,6 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline( const u32 cfg_offset{static_cast<u32>(env.StartAddress() + sizeof(Shader::ProgramHeader))}; Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset, index == 0); - if (Settings::values.dump_shaders) { - env.Dump(key.unique_hashes[index]); - } if (!uses_vertex_a || index != 1) { // Normal path programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info); @@ -623,6 +622,10 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline( programs[index] = MergeDualVertexPrograms(program_va, program_vb, env); } + if (Settings::values.dump_shaders) { + env.Dump(hash, key.unique_hashes[index]); + } + if (programs[index].info.requires_layer_emulation) { layer_source_program = &programs[index]; } @@ -663,6 +666,19 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline( std::move(modules), infos); } catch (const Shader::Exception& exception) { + auto hash = key.Hash(); + size_t env_index{0}; + for (size_t index = 0; 
index < Maxwell::MaxShaderProgram; ++index) { + if (key.unique_hashes[index] == 0) { + continue; + } + Shader::Environment& env{*envs[env_index]}; + ++env_index; + + const u32 cfg_offset{static_cast<u32>(env.StartAddress() + sizeof(Shader::ProgramHeader))}; + Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset, index == 0); + env.Dump(hash, key.unique_hashes[index]); + } LOG_ERROR(Render_Vulkan, "{}", exception.what()); return nullptr; } @@ -712,18 +728,19 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { + auto hash = key.Hash(); if (device.HasBrokenCompute()) { - LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); + LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", hash); return nullptr; } - LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash()); + LOG_INFO(Render_Vulkan, "0x{:016x}", hash); Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; // Dump it before error. 
if (Settings::values.dump_shaders) { - env.Dump(key.Hash()); + env.Dump(hash, key.unique_hash); } auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)}; diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp index d681bd22a..2ef36583b 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -103,8 +103,7 @@ PresentManager::PresentManager(const vk::Instance& instance_, surface{surface_}, blit_supported{CanBlitToSwapchain(device.GetPhysical(), swapchain.GetImageViewFormat())}, use_present_thread{Settings::values.async_presentation.GetValue()}, - image_count{swapchain.GetImageCount()}, last_render_surface{ - render_window_.GetWindowInfo().render_surface} { + image_count{swapchain.GetImageCount()} { auto& dld = device.GetLogical(); cmdpool = dld.CreateCommandPool({ @@ -289,44 +288,36 @@ void PresentManager::PresentThread(std::stop_token token) { } } -void PresentManager::NotifySurfaceChanged() { -#ifdef ANDROID - std::scoped_lock lock{recreate_surface_mutex}; - recreate_surface_cv.notify_one(); -#endif +void PresentManager::RecreateSwapchain(Frame* frame) { + swapchain.Create(*surface, frame->width, frame->height, frame->is_srgb); + image_count = swapchain.GetImageCount(); } void PresentManager::CopyToSwapchain(Frame* frame) { - MICROPROFILE_SCOPE(Vulkan_CopyToSwapchain); - - const auto recreate_swapchain = [&] { - swapchain.Create(*surface, frame->width, frame->height, frame->is_srgb); - image_count = swapchain.GetImageCount(); - }; - -#ifdef ANDROID - std::unique_lock lock{recreate_surface_mutex}; - - const auto needs_recreation = [&] { - if (last_render_surface != render_window.GetWindowInfo().render_surface) { - return true; - } - if (swapchain.NeedsRecreation(frame->is_srgb)) { - return true; + bool requires_recreation = false; + + while (true) { + try { + // Recreate surface and swapchain if needed. 
+ if (requires_recreation) { + surface = CreateSurface(instance, render_window.GetWindowInfo()); + RecreateSwapchain(frame); + } + + // Draw to swapchain. + return CopyToSwapchainImpl(frame); + } catch (const vk::Exception& except) { + if (except.GetResult() != VK_ERROR_SURFACE_LOST_KHR) { + throw; + } + + requires_recreation = true; } - return false; - }; - - recreate_surface_cv.wait_for(lock, std::chrono::milliseconds(400), - [&]() { return !needs_recreation(); }); - - // If the frontend recreated the surface, recreate the renderer surface and swapchain. - if (last_render_surface != render_window.GetWindowInfo().render_surface) { - last_render_surface = render_window.GetWindowInfo().render_surface; - surface = CreateSurface(instance, render_window.GetWindowInfo()); - recreate_swapchain(); } -#endif +} + +void PresentManager::CopyToSwapchainImpl(Frame* frame) { + MICROPROFILE_SCOPE(Vulkan_CopyToSwapchain); // If the size or colorspace of the incoming frames has changed, recreate the swapchain // to account for that. @@ -334,11 +325,11 @@ void PresentManager::CopyToSwapchain(Frame* frame) { const bool size_changed = swapchain.GetWidth() != frame->width || swapchain.GetHeight() != frame->height; if (srgb_changed || size_changed) { - recreate_swapchain(); + RecreateSwapchain(frame); } while (swapchain.AcquireNextImage()) { - recreate_swapchain(); + RecreateSwapchain(frame); } const vk::CommandBuffer cmdbuf{frame->cmdbuf}; @@ -488,4 +479,4 @@ void PresentManager::CopyToSwapchain(Frame* frame) { swapchain.Present(render_semaphore); } -} // namespace Vulkan
\ No newline at end of file +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_present_manager.h b/src/video_core/renderer_vulkan/vk_present_manager.h index 83e859416..a3d825fe6 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.h +++ b/src/video_core/renderer_vulkan/vk_present_manager.h @@ -54,14 +54,15 @@ public: /// Waits for the present thread to finish presenting all queued frames. void WaitPresent(); - /// This is called to notify the rendering backend of a surface change - void NotifySurfaceChanged(); - private: void PresentThread(std::stop_token token); void CopyToSwapchain(Frame* frame); + void CopyToSwapchainImpl(Frame* frame); + + void RecreateSwapchain(Frame* frame); + private: const vk::Instance& instance; Core::Frontend::EmuWindow& render_window; @@ -76,16 +77,13 @@ private: std::queue<Frame*> free_queue; std::condition_variable_any frame_cv; std::condition_variable free_cv; - std::condition_variable recreate_surface_cv; std::mutex swapchain_mutex; - std::mutex recreate_surface_mutex; std::mutex queue_mutex; std::mutex free_mutex; std::jthread present_thread; bool blit_supported; bool use_present_thread; std::size_t image_count{}; - void* last_render_surface{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 29e0b797b..2edaafa7e 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1,139 +1,1555 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later -#include <algorithm> #include <cstddef> +#include <limits> +#include <map> +#include <memory> +#include <span> +#include <type_traits> +#include <unordered_map> #include <utility> #include <vector> +#include "common/bit_util.h" +#include 
"common/common_types.h" +#include "core/memory.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/query_cache/query_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using VideoCore::QueryType; +using Tegra::Engines::Maxwell3D; +using VideoCommon::QueryType; namespace { +class SamplesQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 256; + static constexpr size_t QUERY_SIZE = 8; + explicit SamplesQueryBank(const Device& device_, size_t index_) + : BankBase(BANK_SIZE), device{device_}, index{index_} { + const auto& dev = device.GetLogical(); + query_pool = dev.CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = VK_QUERY_TYPE_OCCLUSION, + .queryCount = BANK_SIZE, + .pipelineStatistics = 0, + }); + Reset(); + } -constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; + ~SamplesQueryBank() = default; -constexpr VkQueryType GetTarget(QueryType type) { - return QUERY_TARGETS[static_cast<std::size_t>(type)]; -} + void Reset() override { + ASSERT(references == 0); + VideoCommon::BankBase::Reset(); + const auto& dev = device.GetLogical(); + dev.ResetQueryPool(*query_pool, 0, BANK_SIZE); + host_results.fill(0ULL); + next_bank = 0; + } + + void Sync(size_t start, size_t size) { + const auto& dev = device.GetLogical(); + const VkResult query_result = dev.GetQueryResults( + *query_pool, 
static_cast<u32>(start), static_cast<u32>(size), sizeof(u64) * size, + &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + switch (query_result) { + case VK_SUCCESS: + return; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + throw vk::Exception(query_result); + } + } + + VkQueryPool GetInnerPool() { + return *query_pool; + } + + size_t GetIndex() const { + return index; + } + + const std::array<u64, BANK_SIZE>& GetResults() const { + return host_results; + } + + size_t next_bank; + +private: + const Device& device; + const size_t index; + vk::QueryPool query_pool; + std::array<u64, BANK_SIZE> host_results; +}; + +using BaseStreamer = VideoCommon::SimpleStreamer<VideoCommon::HostQueryBase>; + +struct HostSyncValues { + VAddr address; + size_t size; + size_t offset; + + static constexpr bool GeneratesBaseBuffer = false; +}; + +class SamplesStreamer : public BaseStreamer { +public: + explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, + VideoCore::RasterizerInterface* rasterizer_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) + : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, + scheduler{scheduler_}, memory_allocator{memory_allocator_} { + current_bank = nullptr; + current_query = nullptr; + ammend_value = 0; + acumulation_value = 0; + queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 8, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + 
.pQueueFamilyIndices = nullptr, + }; + accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); + } + + ~SamplesStreamer() = default; + + void StartCounter() override { + if (has_started) { + return; + } + ReserveHostQuery(); + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); + cmdbuf.BeginQuery(query_pool, static_cast<u32>(query_index), + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); + }); + has_started = true; + } + + void PauseCounter() override { + if (!has_started) { + return; + } + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + cmdbuf.EndQuery(query_pool, static_cast<u32>(query_index)); + }); + has_started = false; + } + + void ResetCounter() override { + if (has_started) { + PauseCounter(); + } + AbandonCurrentQuery(); + std::function<void()> func([this, counts = pending_flush_queries.size()] { + ammend_value = 0; + acumulation_value = 0; + }); + rasterizer->SyncOperation(std::move(func)); + accumulation_since_last_sync = false; + first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); + last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); + } + + void CloseCounter() override { + PauseCounter(); + } -} // Anonymous namespace + bool HasPendingSync() const override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + if (sync_values_stash.empty()) { + return; + } -QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) - : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} + for (size_t i = 0; i < 
sync_values_stash.size(); i++) { + runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], + *buffers[resolve_buffers[i]]); + } + + sync_values_stash.clear(); + } -QueryPool::~QueryPool() = default; + void PresyncWrites() override { + if (pending_sync.empty()) { + return; + } + PauseCounter(); + sync_values_stash.clear(); + sync_values_stash.emplace_back(); + std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); + sync_values->reserve(num_slots_used); + std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; + resolve_buffers.clear(); + size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used); + resolve_buffers.push_back(resolve_buffer_index); + size_t base_offset = 0; -std::pair<VkQueryPool, u32> QueryPool::Commit() { - std::size_t index; - do { - index = CommitResource(); - } while (usage[index]); - usage[index] = true; + ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, + size_t amount) { + size_t bank_id = bank->GetIndex(); + auto& resolve_buffer = buffers[resolve_buffer_index]; + VkQueryPool query_pool = bank->GetInnerPool(); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([start, amount, base_offset, query_pool, + buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { + const VkBufferMemoryBarrier copy_query_pool_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buffer, + .offset = base_offset, + .size = amount * SamplesQueryBank::QUERY_SIZE, + }; + + cmdbuf.CopyQueryPoolResults( + query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, + static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE, + VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + 
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); + }); + offsets[bank_id] = {start, base_offset}; + base_offset += amount * SamplesQueryBank::QUERY_SIZE; + }); + + // Convert queries + bool has_multi_queries = false; + for (auto q : pending_sync) { + auto* query = GetQuery(q); + size_t sync_value_slot = 0; + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + if (accumulation_since_last_sync || query->size_slots > 1) { + if (!has_multi_queries) { + has_multi_queries = true; + sync_values_stash.emplace_back(); + } + sync_value_slot = 1; + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + auto loc_data = offsets[query->start_bank_id]; + sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = SamplesQueryBank::QUERY_SIZE, + .offset = + loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * + SamplesQueryBank::QUERY_SIZE, + }); + } + + if (has_multi_queries) { + size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used); + resolve_buffers.push_back(intermediary_buffer_index); + queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], + *buffers[resolve_buffer_index], num_slots_used, + std::min(first_accumulation_checkpoint, num_slots_used), + last_accumulation_checkpoint); + + } else { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); + } + + ReplicateCurrentQueryIfNeeded(); + std::function<void()> func([this] { ammend_value = acumulation_value; }); + rasterizer->SyncOperation(std::move(func)); + AbandonCurrentQuery(); + num_slots_used = 0; + first_accumulation_checkpoint = std::numeric_limits<size_t>::max(); + last_accumulation_checkpoint = 0; + accumulation_since_last_sync = 
has_multi_queries; + pending_sync.clear(); + } - return {*pools[index / GROW_STEP], static_cast<u32>(index % GROW_STEP)}; + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + [[maybe_unused]] std::optional<u32> subreport) override { + PauseCounter(); + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!current_query) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + PauseCounter(); + current_bank->Close(); + { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + } + + void PopUnsyncedQueries() override { + std::vector<size_t> current_flush_queries; + { + std::scoped_lock lk(flush_guard); + current_flush_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + ApplyBanksWideOp<false>( + current_flush_queries, + [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); }); + for (auto q : current_flush_queries) { + auto* query = GetQuery(q); + u64 total = 0; + ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) { + const auto& results = bank->GetResults(); + for (size_t i = 0; i 
< amount; i++) { + total += results[start + i]; + } + }); + query->value = total; + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + } + } + +private: + template <typename Func> + void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) { + size_t size_slots = query->size_slots; + if (size_slots == 0) { + return; + } + size_t bank_id = query->start_bank_id; + size_t banks_set = query->size_banks; + size_t start_slot = query->start_slot; + for (size_t i = 0; i < banks_set; i++) { + auto& the_bank = bank_pool.GetBank(bank_id); + size_t amount = std::min(the_bank.Size() - start_slot, size_slots); + func(&the_bank, start_slot, amount); + bank_id = the_bank.next_bank - 1; + start_slot = 0; + size_slots -= amount; + } + } + + template <bool is_ordered, typename Func> + void ApplyBanksWideOp(std::vector<size_t>& queries, Func&& func) { + std::conditional_t<is_ordered, std::map<size_t, std::pair<size_t, size_t>>, + std::unordered_map<size_t, std::pair<size_t, size_t>>> + indexer; + for (auto q : queries) { + auto* query = GetQuery(q); + ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { + auto id_ = bank->GetIndex(); + auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(), + std::numeric_limits<size_t>::min()); + auto& current_pair = pair.first->second; + current_pair.first = std::min(current_pair.first, start); + current_pair.second = std::max(current_pair.second, amount + start); + }); + } + for (auto& cont : indexer) { + func(&bank_pool.GetBank(cont.first), cont.second.first, + cont.second.second - cont.second.first); + } + } + + void ReserveBank() { + current_bank_id = + bank_pool.ReserveBank([this](std::deque<SamplesQueryBank>& queue, size_t index) { + queue.emplace_back(device, index); + }); + if (current_bank) { + current_bank->next_bank = current_bank_id + 1; + } + current_bank = &bank_pool.GetBank(current_bank_id); + current_query_pool = current_bank->GetInnerPool(); + } + + size_t 
ReserveBankSlot() { + if (!current_bank || current_bank->IsClosed()) { + ReserveBank(); + } + auto [built, index] = current_bank->Reserve(); + current_bank_slot = index; + return index; + } + + void ReserveHostQuery() { + size_t new_slot = ReserveBankSlot(); + current_bank->AddReference(1); + num_slots_used++; + if (current_query) { + size_t bank_id = current_query->start_bank_id; + size_t banks_set = current_query->size_banks - 1; + bool found = bank_id == current_bank_id; + while (!found && banks_set > 0) { + SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id); + bank_id = some_bank.next_bank - 1; + found = bank_id == current_bank_id; + banks_set--; + } + if (!found) { + current_query->size_banks++; + } + current_query->size_slots++; + } else { + current_query_id = BuildQuery(); + current_query = GetQuery(current_query_id); + current_query->start_bank_id = static_cast<u32>(current_bank_id); + current_query->size_banks = 1; + current_query->start_slot = new_slot; + current_query->size_slots = 1; + } + } + + void Free(size_t query_id) override { + std::scoped_lock lk(guard); + auto* query = GetQuery(query_id); + ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->CloseReference(amount); + }); + ReleaseQuery(query_id); + } + + void AbandonCurrentQuery() { + if (!current_query) { + return; + } + Free(current_query_id); + current_query = nullptr; + current_query_id = 0; + } + + void ReplicateCurrentQueryIfNeeded() { + if (pending_sync.empty()) { + return; + } + if (!current_query) { + return; + } + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = 0; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, 
[](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_flush_queries.push_back(index); + std::function<void()> func([this, index] { + auto* query = GetQuery(index); + query->value += GetAmmendValue(); + SetAccumulationValue(query->value); + Free(index); + }); + rasterizer->SyncOperation(std::move(func)); + } + + template <bool is_resolve> + size_t ObtainBuffer(size_t num_needed) { + const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed)); + if constexpr (is_resolve) { + if (resolve_table[log_2] != 0) { + return resolve_table[log_2] - 1; + } + } else { + if (intermediary_table[log_2] != 0) { + return intermediary_table[log_2] - 1; + } + } + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + if constexpr (is_resolve) { + resolve_table[log_2] = buffers.size(); + } else { + intermediary_table[log_2] = buffers.size(); + } + return buffers.size() - 1; + } + + QueryCacheRuntime& runtime; + VideoCore::RasterizerInterface* rasterizer; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + VideoCommon::BankPool<SamplesQueryBank> bank_pool; + std::deque<vk::Buffer> buffers; + std::array<size_t, 32> resolve_table{}; + std::array<size_t, 32> intermediary_table{}; + vk::Buffer accumulation_buffer; + std::deque<std::vector<HostSyncValues>> sync_values_stash; + std::vector<size_t> resolve_buffers; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + 
std::deque<std::vector<size_t>> pending_flush_sets; + + // State Machine + size_t current_bank_slot; + size_t current_bank_id; + SamplesQueryBank* current_bank; + VkQueryPool current_query_pool; + size_t current_query_id; + size_t num_slots_used{}; + size_t first_accumulation_checkpoint{}; + size_t last_accumulation_checkpoint{}; + bool accumulation_since_last_sync{}; + VideoCommon::HostQueryBase* current_query; + bool has_started{}; + std::mutex flush_guard; + + std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass; +}; + +// Transform feedback queries +class TFBQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 1024; + static constexpr size_t QUERY_SIZE = 4; + explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, + size_t index_) + : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = QUERY_SIZE * BANK_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + ~TFBQueryBank() = default; + + void Reset() override { + ASSERT(references == 0); + VideoCommon::BankBase::Reset(); + } + + void Sync(StagingBufferRef& stagging_buffer, size_t extra_offset, size_t start, size_t size) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, dst_buffer = stagging_buffer.buffer, extra_offset, start, + size](vk::CommandBuffer cmdbuf) { + std::array<VkBufferCopy, 1> copy{VkBufferCopy{ + .srcOffset = start * QUERY_SIZE, + .dstOffset = extra_offset, + .size = size * QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(*buffer, dst_buffer, copy); + }); + } + + size_t GetIndex() const { + return index; + } + + VkBuffer 
GetBuffer() const { + return *buffer; + } + +private: + Scheduler& scheduler; + const size_t index; + vk::Buffer buffer; +}; + +class PrimitivesSucceededStreamer; + +class TFBCounterStreamer : public BaseStreamer { +public: + explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_pool_) + : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + memory_allocator{memory_allocator_}, staging_pool{staging_pool_} { + buffers_count = 0; + current_bank = nullptr; + counter_buffers.fill(VK_NULL_HANDLE); + offsets.fill(0); + last_queries.fill(0); + last_queries_stride.fill(1); + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + + counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + for (auto& c : counter_buffers) { + c = *counters_buffer; + } + size_t base_offset = 0; + for (auto& o : offsets) { + o = base_offset; + base_offset += TFBQueryBank::QUERY_SIZE; + } + } + + ~TFBCounterStreamer() = default; + + void StartCounter() override { + FlushBeginTFB(); + has_started = true; + } + + void PauseCounter() override { + CloseCounter(); + } + + void ResetCounter() override { + CloseCounter(); + } + + void CloseCounter() override { + if (has_flushed_end_pending) { + FlushEndTFB(); + } + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { + if (maxwell3d.regs.transform_feedback_enabled == 0) { + streams_mask = 0; + has_started = false; + } + }); + } + + bool HasPendingSync() const override { + return !pending_sync.empty(); 
+ } + + void SyncWrites() override { + CloseCounter(); + std::unordered_map<size_t, std::vector<HostSyncValues>> sync_values_stash; + for (auto q : pending_sync) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + sync_values_stash.try_emplace(query->start_bank_id); + sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = TFBQueryBank::QUERY_SIZE, + .offset = query->start_slot * TFBQueryBank::QUERY_SIZE, + }); + } + for (auto& p : sync_values_stash) { + auto& bank = bank_pool.GetBank(p.first); + runtime.template SyncValues<HostSyncValues>(p.second, bank.GetBuffer()); + } + pending_sync.clear(); + } + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional<u32> subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast<size_t>(*subreport_); + last_queries[subreport] = address; + if ((streams_mask & (1ULL << subreport)) == 0) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + CloseCounter(); + auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport); + new_query->start_bank_id = static_cast<u32>(bank_slot); + new_query->size_banks = 1; + new_query->start_slot = static_cast<u32>(data_slot); + new_query->size_slots = 1; + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; + } + + 
std::optional<std::pair<VAddr, size_t>> GetLastQueryStream(size_t stream) { + if (last_queries[stream] != 0) { + std::pair<VAddr, size_t> result(last_queries[stream], last_queries_stride[stream]); + return result; + } + return std::nullopt; + } + + Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const { + return out_topology; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + CloseCounter(); + auto staging_ref = staging_pool.Request( + pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true); + size_t offset_base = staging_ref.offset; + for (auto q : pending_flush_queries) { + auto* query = GetQuery(q); + auto& bank = bank_pool.GetBank(query->start_bank_id); + bank.Sync(staging_ref, offset_base, query->start_slot, 1); + offset_base += TFBQueryBank::QUERY_SIZE; + bank.CloseReference(); + } + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + + std::scoped_lock lk(flush_guard); + for (auto& str : free_queue) { + staging_pool.FreeDeferred(str); + } + free_queue.clear(); + download_buffers.emplace_back(staging_ref); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + + void PopUnsyncedQueries() override { + StagingBufferRef staging_ref; + std::vector<size_t> flushed_queries; + { + std::scoped_lock lk(flush_guard); + staging_ref = download_buffers.front(); + flushed_queries = std::move(pending_flush_sets.front()); + download_buffers.pop_front(); + pending_flush_sets.pop_front(); + } + + size_t offset_base = 
staging_ref.offset; + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + u32 result = 0; + std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); + query->value = static_cast<u64>(result); + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + offset_base += TFBQueryBank::QUERY_SIZE; + } + + { + std::scoped_lock lk(flush_guard); + free_queue.emplace_back(staging_ref); + } + } + +private: + void FlushBeginTFB() { + if (has_flushed_end_pending) [[unlikely]] { + return; + } + has_flushed_end_pending = true; + if (!has_started || buffers_count == 0) { + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + UpdateBuffers(); + return; + } + scheduler.Record([this, total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + UpdateBuffers(); + } + + void FlushEndTFB() { + if (!has_flushed_end_pending) [[unlikely]] { + UNREACHABLE(); + return; + } + has_flushed_end_pending = false; + + if (buffers_count == 0) { + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + } else { + scheduler.Record([this, + total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + } + } + + void UpdateBuffers() { + last_queries.fill(0); + last_queries_stride.fill(1); + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { + buffers_count = 0; + out_topology = maxwell3d.draw_manager->GetDrawState().topology; + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + const size_t stream = tf.controls[i].stream; + last_queries_stride[stream] = tf.controls[i].stride; + streams_mask |= 1ULL << stream; + 
buffers_count = std::max<size_t>(buffers_count, stream + 1); + } + }); + } + + std::pair<size_t, size_t> ProduceCounterBuffer(size_t stream) { + if (current_bank == nullptr || current_bank->IsClosed()) { + current_bank_id = + bank_pool.ReserveBank([this](std::deque<TFBQueryBank>& queue, size_t index) { + queue.emplace_back(scheduler, memory_allocator, index); + }); + current_bank = &bank_pool.GetBank(current_bank_id); + } + auto [dont_care, other] = current_bank->Reserve(); + const size_t slot = other; // workaround to compile bug. + current_bank->AddReference(); + + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_buffer = current_bank->GetBuffer(), + src_buffer = counter_buffers[stream], src_offset = offsets[stream], + slot](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + std::array<VkBufferCopy, 1> copy{VkBufferCopy{ + .srcOffset = src_offset, + .dstOffset = slot * TFBQueryBank::QUERY_SIZE, + .size = TFBQueryBank::QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, WRITE_BARRIER); + }); + return {current_bank_id, slot}; + } + + friend class PrimitivesSucceededStreamer; + + static constexpr size_t NUM_STREAMS = 4; + + QueryCacheRuntime& runtime; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + StagingBufferPool& staging_pool; + 
VideoCommon::BankPool<TFBQueryBank> bank_pool; + size_t current_bank_id; + TFBQueryBank* current_bank; + vk::Buffer counters_buffer; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + std::deque<StagingBufferRef> download_buffers; + std::deque<std::vector<size_t>> pending_flush_sets; + std::vector<StagingBufferRef> free_queue; + std::mutex flush_guard; + + // state machine + bool has_started{}; + bool has_flushed_end_pending{}; + size_t buffers_count{}; + std::array<VkBuffer, NUM_STREAMS> counter_buffers{}; + std::array<VkDeviceSize, NUM_STREAMS> offsets{}; + std::array<VAddr, NUM_STREAMS> last_queries; + std::array<size_t, NUM_STREAMS> last_queries_stride; + Maxwell3D::Regs::PrimitiveTopology out_topology; + u64 streams_mask; +}; + +class PrimitivesQueryBase : public VideoCommon::QueryBase { +public: + // Default constructor + PrimitivesQueryBase() + : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {} + + // Parameterized constructor + PrimitivesQueryBase(bool has_timestamp, VAddr address) + : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) { + if (has_timestamp) { + flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + } + + u64 stride{}; + VAddr dependant_address{}; + Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points}; + size_t dependant_index{}; + bool dependant_manage{}; +}; + +class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer<PrimitivesQueryBase> { +public: + explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_, + TFBCounterStreamer& tfb_streamer_, + Core::Memory::Memory& cpu_memory_) + : VideoCommon::SimpleStreamer<PrimitivesQueryBase>(id_), runtime{runtime_}, + tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} { + MakeDependent(&tfb_streamer); + } + + ~PrimitivesSucceededStreamer() = default; + + size_t WriteCounter(VAddr address, bool 
has_timestamp, u32 value, + std::optional<u32> subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast<size_t>(*subreport_); + auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); + bool must_manage_dependance = false; + new_query->topology = tfb_streamer.GetOutputTopology(); + if (dependant_address_opt) { + auto [dep_address, stride] = *dependant_address_opt; + new_query->dependant_address = dep_address; + new_query->stride = stride; + } else { + new_query->dependant_index = + tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); + auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index); + dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated; + must_manage_dependance = true; + if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + new_query->value = 0; + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + if (must_manage_dependance) { + tfb_streamer.Free(new_query->dependant_index); + } + return index; + } + new_query->stride = 1; + runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) { + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + if (tf.controls[i].stream != subreport) { + continue; + } + new_query->stride = tf.controls[i].stride; + break; + } + }); + } + + new_query->dependant_manage = must_manage_dependance; + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + 
void PushUnsyncedQueries() override { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + pending_flush_queries.clear(); + } + + void PopUnsyncedQueries() override { + std::vector<size_t> flushed_queries; + { + std::scoped_lock lk(flush_guard); + flushed_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + continue; + } + + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + u64 num_vertices = 0; + if (query->dependant_manage) { + auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); + num_vertices = dependant_query->value / query->stride; + tfb_streamer.Free(query->dependant_index); + } else { + u8* pointer = cpu_memory.GetPointer(query->dependant_address); + u32 result; + std::memcpy(&result, pointer, sizeof(u32)); + num_vertices = static_cast<u64>(result) / query->stride; + } + query->value = [&]() -> u64 { + switch (query->topology) { + case Maxwell3D::Regs::PrimitiveTopology::Points: + return num_vertices; + case Maxwell3D::Regs::PrimitiveTopology::Lines: + return num_vertices / 2; + case Maxwell3D::Regs::PrimitiveTopology::LineLoop: + return (num_vertices / 2) + 1; + case Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return num_vertices - 1; + case Maxwell3D::Regs::PrimitiveTopology::Patches: + case Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + return num_vertices / 3; + case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return num_vertices - 2; + case Maxwell3D::Regs::PrimitiveTopology::Quads: + return num_vertices / 4; + case Maxwell3D::Regs::PrimitiveTopology::Polygon: + return 1U; + default: + return 
num_vertices; + } + }(); + } + } + +private: + QueryCacheRuntime& runtime; + TFBCounterStreamer& tfb_streamer; + Core::Memory::Memory& cpu_memory; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + std::deque<std::vector<size_t>> pending_flush_sets; + std::mutex flush_guard; +}; + +} // namespace + +struct QueryCacheRuntimeImpl { + QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_, + Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_, + const Device& device_, const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) + : rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, + buffer_cache{buffer_cache_}, device{device_}, + memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, + guest_streamer(0, runtime), + sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, + device, scheduler, memory_allocator, compute_pass_descriptor_queue, + descriptor_pool), + tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, + scheduler, memory_allocator, staging_pool), + primitives_succeeded_streamer( + static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, + cpu_memory_), + primitives_needed_minus_suceeded_streamer( + static_cast<size_t>(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u), + hcr_setup{}, hcr_is_set{}, is_hcr_running{}, maxwell3d{} { + + hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; + hcr_setup.pNext = nullptr; + hcr_setup.flags = 0; + + conditional_resolve_pass = std::make_unique<ConditionalRenderingResolvePass>( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = 
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = sizeof(u32), + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + VideoCore::RasterizerInterface* rasterizer; + Core::Memory::Memory& cpu_memory; + Vulkan::BufferCache& buffer_cache; + + const Device& device; + const MemoryAllocator& memory_allocator; + Scheduler& scheduler; + StagingBufferPool& staging_pool; + + // Streamers + VideoCommon::GuestStreamer<QueryCacheParams> guest_streamer; + SamplesStreamer sample_streamer; + TFBCounterStreamer tfb_streamer; + PrimitivesSucceededStreamer primitives_succeeded_streamer; + VideoCommon::StubStreamer<QueryCacheParams> primitives_needed_minus_suceeded_streamer; + + std::vector<std::pair<VAddr, VAddr>> little_cache; + std::vector<std::pair<VkBuffer, VkDeviceSize>> buffers_to_upload_to; + std::vector<size_t> redirect_cache; + std::vector<std::vector<VkBufferCopy>> copies_setup; + + // Host conditional rendering data + std::unique_ptr<ConditionalRenderingResolvePass> conditional_resolve_pass; + vk::Buffer hcr_resolve_buffer; + VkConditionalRenderingBeginInfoEXT hcr_setup; + VkBuffer hcr_buffer; + size_t hcr_offset; + bool hcr_is_set; + bool is_hcr_running; + + // maxwell3d + Maxwell3D* maxwell3d; +}; + +QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) { + impl = std::make_unique<QueryCacheRuntimeImpl>( + *this, rasterizer, cpu_memory_, 
buffer_cache_, device_, memory_allocator_, scheduler_, + staging_pool_, compute_pass_descriptor_queue, descriptor_pool); } -void QueryPool::Allocate(std::size_t begin, std::size_t end) { - usage.resize(end); +void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { + impl->maxwell3d = maxwell3d; +} - pools.push_back(device.GetLogical().CreateQueryPool({ - .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .queryType = GetTarget(type), - .queryCount = static_cast<u32>(end - begin), - .pipelineStatistics = 0, - })); +template <typename Func> +void QueryCacheRuntime::View3DRegs(Func&& func) { + if (impl->maxwell3d) { + func(*impl->maxwell3d); + } +} + +void QueryCacheRuntime::EndHostConditionalRendering() { + PauseHostConditionalRendering(); + impl->hcr_is_set = false; + impl->is_hcr_running = false; + impl->hcr_buffer = nullptr; + impl->hcr_offset = 0; +} + +void QueryCacheRuntime::PauseHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (impl->is_hcr_running) { + impl->scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); }); + } + impl->is_hcr_running = false; } -void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { - const auto it = - std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { - return query_pool == *pool; +void QueryCacheRuntime::ResumeHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (!impl->is_hcr_running) { + impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginConditionalRenderingEXT(hcr_setup); }); + } + impl->is_hcr_running = true; +} - if (it != std::end(pools)) { - const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); - usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, + bool is_equal) { + { + 
std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op); + impl->hcr_buffer = buffer->Handle(); + impl->hcr_offset = offset; + } + if (impl->hcr_is_set) { + if (impl->hcr_setup.buffer == impl->hcr_buffer && + impl->hcr_setup.offset == impl->hcr_offset) { + ResumeHostConditionalRendering(); + return; + } + PauseHostConditionalRendering(); } + impl->hcr_setup.buffer = impl->hcr_buffer; + impl->hcr_setup.offset = impl->hcr_offset; + impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); } -QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_) - : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, - query_pools{ - QueryPool{device_, scheduler_, QueryType::SamplesPassed}, - } {} - -QueryCache::~QueryCache() { - // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class - // destructor is called. The query cache should be redesigned to have a proper ownership model - // instead of using shared pointers. 
- for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { - auto& stream = Stream(static_cast<QueryType>(query_type)); - stream.Update(false); - stream.Reset(); +void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) { + VkBuffer to_resolve; + u32 to_resolve_offset; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op); + to_resolve = buffer->Handle(); + to_resolve_offset = static_cast<u32>(offset); } + if (impl->is_hcr_running) { + PauseHostConditionalRendering(); + } + impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve, + to_resolve_offset, false); + impl->hcr_setup.buffer = *impl->hcr_resolve_buffer; + impl->hcr_setup.offset = 0; + impl->hcr_setup.flags = is_equal ? 
0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); } -std::pair<VkQueryPool, u32> QueryCache::AllocateQuery(QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(); +bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, + [[maybe_unused]] bool qc_dirty) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + HostConditionalRenderingCompareValueImpl(object_1, false); + return true; } -void QueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { - query_pools[static_cast<std::size_t>(type)].Reserve(query); +bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, + bool qc_dirty, bool equal_check) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + + const auto check_in_bc = [&](VAddr address) { + return impl->buffer_cache.IsRegionGpuModified(address, 8); + }; + const auto check_value = [&](VAddr address) { + u8* ptr = impl->cpu_memory.GetPointer(address); + u64 value{}; + std::memcpy(&value, ptr, sizeof(value)); + return value == 0; + }; + std::array<VideoCommon::LookupData*, 2> objects{&object_1, &object_2}; + std::array<bool, 2> is_in_bc{}; + std::array<bool, 2> is_in_qc{}; + std::array<bool, 2> is_in_ac{}; + std::array<bool, 2> is_null{}; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + for (size_t i = 0; i < 2; i++) { + is_in_qc[i] = objects[i]->found_query != nullptr; + is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address); + is_in_ac[i] = is_in_qc[i] || is_in_bc[i]; + } + } + + if (!is_in_ac[0] && !is_in_ac[1]) { + EndHostConditionalRendering(); + return false; + } + + if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) { + EndHostConditionalRendering(); + return false; + } + + const bool is_gpu_high = Settings::IsGPULevelHigh(); + if (!is_gpu_high && 
impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + return true; + } + + for (size_t i = 0; i < 2; i++) { + is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); + } + + for (size_t i = 0; i < 2; i++) { + if (is_null[i]) { + size_t j = (i + 1) % 2; + HostConditionalRenderingCompareValueImpl(*objects[j], equal_check); + return true; + } + } + + if (!is_gpu_high) { + return true; + } + + if (!is_in_bc[0] && !is_in_bc[1]) { + // Both queries are in query cache, it's best to just flush. + return true; + } + HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); + return true; } -HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, - QueryType type_) - : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, - query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { - const vk::Device* logical = &cache.GetDevice().GetLogical(); - cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { - const bool use_precise = Settings::IsGPULevelHigh(); - logical->ResetQueryPool(query_.first, query_.second, 1); - cmdbuf.BeginQuery(query_.first, query_.second, - use_precise ? 
VK_QUERY_CONTROL_PRECISE_BIT : 0); - }); +QueryCacheRuntime::~QueryCacheRuntime() = default; + +VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) { + switch (query_type) { + case QueryType::Payload: + return &impl->guest_streamer; + case QueryType::ZPassPixelCount64: + return &impl->sample_streamer; + case QueryType::StreamingByteCount: + return &impl->tfb_streamer; + case QueryType::StreamingPrimitivesNeeded: + case QueryType::VtgPrimitivesOut: + case QueryType::StreamingPrimitivesSucceeded: + return &impl->primitives_succeeded_streamer; + case QueryType::StreamingPrimitivesNeededMinusSucceeded: + return &impl->primitives_needed_minus_suceeded_streamer; + default: + return nullptr; + } } -HostCounter::~HostCounter() { - cache.Reserve(type, query); +void QueryCacheRuntime::Barriers(bool is_prebarrier) { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + if (is_prebarrier) { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + }); + } else { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + } } -void HostCounter::EndQuery() { - cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { - cmdbuf.EndQuery(query_.first, query_.second); +template <typename SyncValuesType> +void QueryCacheRuntime::SyncValues(std::span<SyncValuesType> values, 
VkBuffer base_src_buffer) { + if (values.size() == 0) { + return; + } + impl->redirect_cache.clear(); + impl->little_cache.clear(); + size_t total_size = 0; + for (auto& sync_val : values) { + total_size += sync_val.size; + bool found = false; + VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE); + VAddr base_end = base + Core::Memory::YUZU_PAGESIZE; + for (size_t i = 0; i < impl->little_cache.size(); i++) { + const auto set_found = [&] { + impl->redirect_cache.push_back(i); + found = true; + }; + auto& loc = impl->little_cache[i]; + if (base < loc.second && loc.first < base_end) { + set_found(); + break; + } + if (loc.first == base_end) { + loc.first = base; + set_found(); + break; + } + if (loc.second == base) { + loc.second = base_end; + set_found(); + break; + } + } + if (!found) { + impl->redirect_cache.push_back(impl->little_cache.size()); + impl->little_cache.emplace_back(base, base_end); + } + } + + // Vulkan part. + std::scoped_lock lk(impl->buffer_cache.mutex); + impl->buffer_cache.BufferOperations([&] { + impl->buffers_to_upload_to.clear(); + for (auto& pair : impl->little_cache) { + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer( + pair.first, static_cast<u32>(pair.second - pair.first), sync_info, post_op); + impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset); + } }); -} -u64 HostCounter::BlockingQuery(bool async) const { - if (!async) { - cache.GetScheduler().Wait(tick); - } - u64 data; - const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( - query.first, query.second, 1, sizeof(data), &data, sizeof(data), - VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - - switch (query_result) { - case VK_SUCCESS: - return data; - case VK_ERROR_DEVICE_LOST: - cache.GetDevice().ReportLoss(); - [[fallthrough]]; - default: 
- throw vk::Exception(query_result); + VkBuffer src_buffer; + [[maybe_unused]] StagingBufferRef ref; + impl->copies_setup.clear(); + impl->copies_setup.resize(impl->little_cache.size()); + if constexpr (SyncValuesType::GeneratesBaseBuffer) { + ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload); + size_t current_offset = ref.offset; + size_t accumulated_size = 0; + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = current_offset + accumulated_size, + .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, + values[i].size); + accumulated_size += values[i].size; + } + src_buffer = ref.buffer; + } else { + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = values[i].offset, + .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + } + src_buffer = base_src_buffer; } + + impl->scheduler.RequestOutsideRenderPassOperationContext(); + impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to), + vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) { + size_t size = dst_buffers.size(); + for (size_t i = 0; i < size; i++) { + cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]); + } + }); } } // namespace Vulkan + +namespace VideoCommon { + +template class QueryCacheBase<Vulkan::QueryCacheParams>; + +} // namespace VideoCommon diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index c1b9552eb..e9a1ea169 100644 --- 
a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -1,101 +1,75 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once -#include <cstddef> #include <memory> -#include <utility> -#include <vector> -#include "common/common_types.h" -#include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_pool.h" -#include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/query_cache/query_cache_base.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" namespace VideoCore { class RasterizerInterface; } +namespace VideoCommon { +class StreamerInterface; +} + namespace Vulkan { -class CachedQuery; class Device; -class HostCounter; -class QueryCache; class Scheduler; +class StagingBufferPool; -using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; +struct QueryCacheRuntimeImpl; -class QueryPool final : public ResourcePool { +class QueryCacheRuntime { public: - explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); - ~QueryPool() override; + explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, + StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool); + ~QueryCacheRuntime(); - std::pair<VkQueryPool, u32> Commit(); + template <typename SyncValuesType> + void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr); - void Reserve(std::pair<VkQueryPool, u32> query); + void Barriers(bool is_prebarrier); -protected: - void Allocate(std::size_t begin, std::size_t end) 
override; + void EndHostConditionalRendering(); -private: - static constexpr std::size_t GROW_STEP = 512; + void PauseHostConditionalRendering(); - const Device& device; - const VideoCore::QueryType type; + void ResumeHostConditionalRendering(); - std::vector<vk::QueryPool> pools; - std::vector<bool> usage; -}; + bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); -class QueryCache final - : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { -public: - explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_); - ~QueryCache(); - - std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type); + bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, bool qc_dirty, + bool equal_check); - void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); + VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); - const Device& GetDevice() const noexcept { - return device; - } + void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); - Scheduler& GetScheduler() const noexcept { - return scheduler; - } + template <typename Func> + void View3DRegs(Func&& func); private: - const Device& device; - Scheduler& scheduler; - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); + void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); + friend struct QueryCacheRuntimeImpl; + std::unique_ptr<QueryCacheRuntimeImpl> impl; }; -class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { -public: - explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, - VideoCore::QueryType type_); - ~HostCounter(); - - void EndQuery(); - -private: - 
u64 BlockingQuery(bool async = false) const override; - - QueryCache& cache; - const VideoCore::QueryType type; - const std::pair<VkQueryPool, u32> query; - const u64 tick; +struct QueryCacheParams { + using RuntimeType = typename Vulkan::QueryCacheRuntime; }; -class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { -public: - explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) - : CachedQueryBase{cpu_addr_, host_ptr_} {} -}; +using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 456bb040e..83f2b6045 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -6,6 +6,8 @@ #include <memory> #include <mutex> +#include "video_core/renderer_vulkan/renderer_vulkan.h" + #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" @@ -18,11 +20,11 @@ #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -169,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), buffer_cache(*this, 
cpu_memory_, buffer_cache_runtime), + query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, + staging_pool, compute_pass_descriptor_queue, descriptor_pool), + query_cache(gpu, *this, cpu_memory_, query_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, cpu_memory_, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { @@ -188,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { FlushWork(); gpu_memory->FlushCaching(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; if (!pipeline) { @@ -206,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); - BeginTransformFeedback(); - UpdateDynamicStates(); + HandleTransformFeedback(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); draw_func(); - - EndTransformFeedback(); } void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { @@ -240,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() { const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); const auto& buffer = indirect_buffer.first; const auto& offset = indirect_buffer.second; + if (params.is_byte_count) { + scheduler.Record([buffer_obj = buffer->Handle(), offset, + stride = params.stride](vk::CommandBuffer cmdbuf) { + cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, 
offset, 0, + static_cast<u32>(stride)); + }); + return; + } if (params.include_count) { const auto count = buffer_cache.GetDrawIndirectCount(); const auto& draw_buffer = count.first; @@ -279,20 +283,15 @@ void RasterizerVulkan::DrawTexture() { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.UpdateRenderTargets(false); UpdateDynamicStates(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); @@ -315,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. 
- query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || @@ -427,15 +421,28 @@ void RasterizerVulkan::Clear(u32 layer_count) { if (aspect_flags == 0) { return; } - scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, - clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { - VkClearAttachment attachment; - attachment.aspectMask = aspect_flags; - attachment.colorAttachment = 0; - attachment.clearValue.depthStencil.depth = clear_depth; - attachment.clearValue.depthStencil.stencil = clear_stencil; - cmdbuf.ClearAttachments(attachment, clear_rect); - }); + + if (use_stencil && framebuffer->HasAspectStencilBit() && regs.stencil_front_mask != 0xFF && + regs.stencil_front_mask != 0) { + Region2D dst_region = { + Offset2D{.x = clear_rect.rect.offset.x, .y = clear_rect.rect.offset.y}, + Offset2D{.x = clear_rect.rect.offset.x + static_cast<s32>(clear_rect.rect.extent.width), + .y = clear_rect.rect.offset.y + + static_cast<s32>(clear_rect.rect.extent.height)}}; + blit_image.ClearDepthStencil(framebuffer, use_depth, regs.clear_depth, + static_cast<u8>(regs.stencil_front_mask), regs.clear_stencil, + regs.stencil_front_func_mask, dst_region); + } else { + scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, + clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { + VkClearAttachment attachment; + attachment.aspectMask = aspect_flags; + attachment.colorAttachment = 0; + attachment.clearValue.depthStencil.depth = clear_depth; + attachment.clearValue.depthStencil.stencil = clear_stencil; + cmdbuf.ClearAttachments(attachment, clear_rect); + }); + } } void RasterizerVulkan::DispatchCompute() { @@ -450,18 +457,32 @@ void 
RasterizerVulkan::DispatchCompute() { pipeline->Configure(*kepler_compute, *gpu_memory, scheduler, buffer_cache, texture_cache); const auto& qmd{kepler_compute->launch_description}; + auto indirect_address = kepler_compute->GetIndirectComputeAddress(); + if (indirect_address) { + // DispatchIndirect + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([indirect_buffer = buffer->Handle(), + indirect_offset = offset](vk::CommandBuffer cmdbuf) { + cmdbuf.DispatchIndirect(indirect_buffer, indirect_offset); + }); + return; + } const std::array<u32, 3> dim{qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z}; scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); } -void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { - query_cache.ResetCounter(type); +void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { + query_cache.CounterReset(type); } -void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional<u64> timestamp) { - query_cache.Query(gpu_addr, type, timestamp); +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { + query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); } void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -642,8 +663,8 @@ void RasterizerVulkan::SignalReference() { fence_manager.SignalReference(); } -void RasterizerVulkan::ReleaseFences() { - fence_manager.WaitPendingFences(); +void RasterizerVulkan::ReleaseFences(bool force) { + fence_manager.WaitPendingFences(force); 
} void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, @@ -667,6 +688,8 @@ void RasterizerVulkan::WaitForIdle() { flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; } + query_cache.NotifyWFI(); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { cmdbuf.SetEvent(event, flags); @@ -710,19 +733,7 @@ void RasterizerVulkan::TickFrame() { bool RasterizerVulkan::AccelerateConditionalRendering() { gpu_memory->FlushCaching(); - if (Settings::IsGPULevelHigh()) { - // TODO(Blinkhawk): Reimplement Host conditional rendering. - return false; - } - // Medium / Low Hack: stub any checks on queries written into the buffer cache. - const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; - Maxwell::ReportSemaphore::Compare cmp; - if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), - VideoCommon::CacheType::BufferCache | - VideoCommon::CacheType::QueryCache)) { - return true; - } - return false; + return query_cache.AccelerateHostConditionalRendering(); } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, @@ -768,6 +779,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!image_view) { return false; } + query_cache.NotifySegment(false); screen_info.image = image_view->ImageHandle(); screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); screen_info.width = image_view->size.width; @@ -829,7 +841,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, } const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto post_op = IS_IMAGE_UPLOAD ? 
VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; const auto [buffer, offset] = buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); @@ -838,8 +851,12 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, const std::span copy_span{&copy, 1}; if constexpr (IS_IMAGE_UPLOAD) { + texture_cache.PrepareImage(image_id, true, false); image->UploadMemory(buffer->Handle(), offset, copy_span); } else { + if (offset % BytesPerBlock(image->info.format)) { + return false; + } texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, buffer_operand.address, buffer_size); } @@ -901,31 +918,18 @@ void RasterizerVulkan::UpdateDynamicStates() { } } -void RasterizerVulkan::BeginTransformFeedback() { +void RasterizerVulkan::HandleTransformFeedback() { const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; - } if (!device.IsExtTransformFeedbackSupported()) { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || - regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); -} - -void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; - } - if (!device.IsExtTransformFeedbackSupported()) { - return; + query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, + regs.transform_feedback_enabled); + if (regs.transform_feedback_enabled != 0) { + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || + regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); } - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); 
}); } void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { @@ -1011,15 +1015,37 @@ void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) { regs.zeta.format == Tegra::DepthFormat::X8Z24_UNORM || regs.zeta.format == Tegra::DepthFormat::S8Z24_UNORM || regs.zeta.format == Tegra::DepthFormat::V8Z24_UNORM; - if (is_d24 && !device.SupportsD24DepthBuffer()) { + bool force_unorm = ([&] { + if (!is_d24 || device.SupportsD24DepthBuffer()) { + return false; + } + if (device.IsExtDepthBiasControlSupported()) { + return true; + } + if (!Settings::values.renderer_amdvlk_depth_bias_workaround) { + return false; + } // the base formulas can be obtained from here: // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d10-graphics-programming-guide-output-merger-stage-depth-bias const double rescale_factor = static_cast<double>(1ULL << (32 - 24)) / (static_cast<double>(0x1.ep+127)); units = static_cast<float>(static_cast<double>(units) * rescale_factor); - } + return false; + })(); scheduler.Record([constant = units, clamp = regs.depth_bias_clamp, - factor = regs.slope_scale_depth_bias](vk::CommandBuffer cmdbuf) { + factor = regs.slope_scale_depth_bias, force_unorm, + precise = device.HasExactDepthBiasControl()](vk::CommandBuffer cmdbuf) { + if (force_unorm) { + VkDepthBiasRepresentationInfoEXT info{ + .sType = VK_STRUCTURE_TYPE_DEPTH_BIAS_REPRESENTATION_INFO_EXT, + .pNext = nullptr, + .depthBiasRepresentation = + VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT, + .depthBiasExact = precise ? 
VK_TRUE : VK_FALSE, + }; + cmdbuf.SetDepthBias(constant, clamp, factor, &info); + return; + } cmdbuf.SetDepthBias(constant, clamp, factor); }); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 73257d964..ad069556c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -7,13 +7,14 @@ #include <boost/container/static_vector.hpp> +#include "video_core/renderer_vulkan/vk_buffer_cache.h" + #include "common/common_types.h" #include "video_core/control/channel_state_cache.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/blit_image.h" -#include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" @@ -83,8 +84,9 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -105,7 +107,7 @@ public: void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force = true) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, 
VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; @@ -145,9 +147,7 @@ private: void UpdateDynamicStates(); - void BeginTransformFeedback(); - - void EndTransformFeedback(); + void HandleTransformFeedback(); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -194,8 +194,9 @@ private: TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; BufferCache buffer_cache; - PipelineCache pipeline_cache; + QueryCacheRuntime query_cache_runtime; QueryCache query_cache; + PipelineCache pipeline_cache; AccelerateDMA accelerate_dma; FenceManager fence_manager; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 17ef61147..3be7837f4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -6,11 +6,12 @@ #include <thread> #include <utility> +#include "video_core/renderer_vulkan/vk_query_cache.h" + #include "common/microprofile.h" #include "common/thread.h" #include "video_core/renderer_vulkan/vk_command_pool.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" -#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" @@ -242,10 +243,10 @@ void Scheduler::AllocateNewContext() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); } #else - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); #endif } } @@ -260,11 +261,12 @@ void Scheduler::EndPendingOperations() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. 
- query_cache->DisableStreams(); + // query_cache->DisableStreams(); } #else - query_cache->DisableStreams(); + // query_cache->DisableStreams(); #endif + query_cache->NotifySegment(false); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 475c682eb..da03803aa 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -17,6 +17,11 @@ #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +namespace VideoCommon { +template <typename Trait> +class QueryCacheBase; +} + namespace Vulkan { class CommandPool; @@ -24,7 +29,8 @@ class Device; class Framebuffer; class GraphicsPipeline; class StateTracker; -class QueryCache; + +struct QueryCacheParams; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. @@ -63,7 +69,7 @@ public: void InvalidateState(); /// Assigns the query cache. 
- void SetQueryCache(QueryCache& query_cache_) { + void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) { query_cache = &query_cache_; } @@ -219,7 +225,7 @@ private: std::unique_ptr<MasterSemaphore> master_semaphore; std::unique_ptr<CommandPool> command_pool; - QueryCache* query_cache = nullptr; + VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index ce92f66ab..b278614e6 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -24,25 +24,38 @@ using namespace Common::Literals; // Maximum potential alignment of a Vulkan buffer constexpr VkDeviceSize MAX_ALIGNMENT = 256; -// Maximum size to put elements in the stream buffer -constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; // Stream buffer size in bytes -constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; -constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; +constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB; -size_t Region(size_t iterator) noexcept { - return iterator / REGION_SIZE; +size_t GetStreamBufferSize(const Device& device) { + VkDeviceSize size{0}; + if (device.HasDebuggingToolAttached()) { + ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) { + size = std::max(size, heap.size); + }); + // If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be + // loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue + // as the heap will be much larger. 
+ if (size <= 256_MiB) { + size = size * 40 / 100; + } + } else { + size = MAX_STREAM_BUFFER_SIZE; + } + return std::min(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE); } } // Anonymous namespace StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) - : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { + : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, + stream_buffer_size{GetStreamBufferSize(device)}, region_size{stream_buffer_size / + StagingBufferPool::NUM_SYNCS} { VkBufferCreateInfo stream_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = STREAM_BUFFER_SIZE, + .size = stream_buffer_size, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, @@ -63,7 +76,7 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem StagingBufferPool::~StagingBufferPool() = default; StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) { - if (!deferred && usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) { + if (!deferred && usage == MemoryUsage::Upload && size <= region_size) { return GetStreamBuffer(size); } return GetStagingBuffer(size, usage, deferred); @@ -101,7 +114,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { used_iterator = iterator; free_iterator = std::max(free_iterator, iterator + size); - if (iterator + size >= STREAM_BUFFER_SIZE) { + if (iterator + size >= stream_buffer_size) { std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS, current_tick); used_iterator = 0; diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 
5f69f08b1..d3deb9072 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -90,6 +90,9 @@ private: void ReleaseCache(MemoryUsage usage); void ReleaseLevel(StagingBuffersCache& cache, size_t log2); + size_t Region(size_t iter) const noexcept { + return iter / region_size; + } const Device& device; MemoryAllocator& memory_allocator; @@ -97,6 +100,8 @@ private: vk::Buffer stream_buffer; std::span<u8> stream_pointer; + VkDeviceSize stream_buffer_size; + VkDeviceSize region_size; size_t iterator = 0; size_t used_iterator = 0; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index d3cddac69..81ef98f61 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -45,8 +45,8 @@ static VkPresentModeKHR ChooseSwapPresentMode(bool has_imm, bool has_mailbox, return mode; } switch (mode) { - case Settings::VSyncMode::FIFO: - case Settings::VSyncMode::FIFORelaxed: + case Settings::VSyncMode::Fifo: + case Settings::VSyncMode::FifoRelaxed: if (has_mailbox) { return Settings::VSyncMode::Mailbox; } else if (has_imm) { @@ -59,8 +59,8 @@ static VkPresentModeKHR ChooseSwapPresentMode(bool has_imm, bool has_mailbox, }(); if ((setting == Settings::VSyncMode::Mailbox && !has_mailbox) || (setting == Settings::VSyncMode::Immediate && !has_imm) || - (setting == Settings::VSyncMode::FIFORelaxed && !has_fifo_relaxed)) { - setting = Settings::VSyncMode::FIFO; + (setting == Settings::VSyncMode::FifoRelaxed && !has_fifo_relaxed)) { + setting = Settings::VSyncMode::Fifo; } switch (setting) { @@ -68,9 +68,9 @@ static VkPresentModeKHR ChooseSwapPresentMode(bool has_imm, bool has_mailbox, return VK_PRESENT_MODE_IMMEDIATE_KHR; case Settings::VSyncMode::Mailbox: return VK_PRESENT_MODE_MAILBOX_KHR; - case Settings::VSyncMode::FIFO: + case Settings::VSyncMode::Fifo: return VK_PRESENT_MODE_FIFO_KHR; - case 
Settings::VSyncMode::FIFORelaxed: + case Settings::VSyncMode::FifoRelaxed: return VK_PRESENT_MODE_FIFO_RELAXED_KHR; default: return VK_PRESENT_MODE_FIFO_KHR; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index bf6ad6c79..00ab47268 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -11,6 +11,8 @@ #include "common/bit_util.h" #include "common/settings.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" + #include "video_core/engines/fermi_2d.h" #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" @@ -18,7 +20,6 @@ #include "video_core/renderer_vulkan/vk_render_pass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/util.h" @@ -119,19 +120,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { return usage; } -/// Returns the preferred format for a VkImage -[[nodiscard]] PixelFormat StorageFormat(PixelFormat format) { - switch (format) { - case PixelFormat::A8B8G8R8_SRGB: - return PixelFormat::A8B8G8R8_UNORM; - default: - return format; - } -} - [[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { - const PixelFormat format = StorageFormat(info.format); - const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); + const auto format_info = + MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, info.format); VkImageCreateFlags flags{}; if (info.type == ImageType::e2D && info.resources.layers >= 6 && info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) { 
@@ -156,7 +147,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { .arrayLayers = static_cast<u32>(info.resources.layers), .samples = ConvertSampleCount(info.num_samples), .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = ImageUsageFlags(format_info, format), + .usage = ImageUsageFlags(format_info, info.format), .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, @@ -185,6 +176,36 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { return allocator.CreateImage(image_ci); } +[[nodiscard]] vk::ImageView MakeStorageView(const vk::Device& device, u32 level, VkImage image, + VkFormat format) { + static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .pNext = nullptr, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + return device.CreateImageView(VkImageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = &storage_image_view_usage_create_info, + .flags = 0, + .image = image, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = format, + .components{ + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = level, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }); +} + [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { switch (VideoCore::Surface::GetFormatType(format)) { case VideoCore::Surface::SurfaceType::ColorTexture: @@ -217,6 +238,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { return any_r ? 
VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; case PixelFormat::D16_UNORM: case PixelFormat::D32_FLOAT: + case PixelFormat::X8_D24_UNORM: return VK_IMAGE_ASPECT_DEPTH_BIT; case PixelFormat::S8_UINT: return VK_IMAGE_ASPECT_STENCIL_BIT; @@ -599,7 +621,7 @@ void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage im } void TryTransformSwizzleIfNeeded(PixelFormat format, std::array<SwizzleSource, 4>& swizzle, - bool emulate_bgr565) { + bool emulate_bgr565, bool emulate_a4b4g4r4) { switch (format) { case PixelFormat::A1B5G5R5_UNORM: std::ranges::transform(swizzle, swizzle.begin(), SwapBlueRed); @@ -615,6 +637,11 @@ void TryTransformSwizzleIfNeeded(PixelFormat format, std::array<SwizzleSource, 4 case PixelFormat::G4R4_UNORM: std::ranges::transform(swizzle, swizzle.begin(), SwapGreenRed); break; + case PixelFormat::A4B4G4R4_UNORM: + if (emulate_a4b4g4r4) { + std::ranges::reverse(swizzle); + } + break; default: break; } @@ -817,10 +844,14 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched : device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_}, staging_buffer_pool{staging_buffer_pool_}, blit_image_helper{blit_image_helper_}, render_pass_cache{render_pass_cache_}, resolution{Settings::values.resolution_info} { - if (Settings::values.accelerate_astc) { + if (Settings::values.accelerate_astc.GetValue() == Settings::AstcDecodeMode::Gpu) { astc_decoder_pass.emplace(device, scheduler, descriptor_pool, staging_buffer_pool, compute_pass_descriptor_queue, memory_allocator); } + if (device.IsStorageImageMultisampleSupported()) { + msaa_copy_pass = std::make_unique<MSAACopyPass>( + device, scheduler, descriptor_pool, staging_buffer_pool, compute_pass_descriptor_queue); + } if (!device.IsKhrImageFormatListSupported()) { return; } @@ -1043,15 +1074,27 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst dst_region, src_region, filter, operation); return; } + 
ASSERT(src.format == dst.format); if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - if (!device.IsBlitDepthStencilSupported()) { + const auto format = src.format; + const auto can_blit_depth_stencil = [this, format] { + switch (format) { + case VideoCore::Surface::PixelFormat::D24_UNORM_S8_UINT: + case VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM: + return device.IsBlitDepth24Stencil8Supported(); + case VideoCore::Surface::PixelFormat::D32_FLOAT_S8_UINT: + return device.IsBlitDepth32Stencil8Supported(); + default: + UNREACHABLE(); + } + }(); + if (!can_blit_depth_stencil) { UNIMPLEMENTED_IF(is_src_msaa || is_dst_msaa); blit_image_helper.BlitDepthStencil(dst_framebuffer, src.DepthView(), src.StencilView(), dst_region, src_region, filter, operation); return; } } - ASSERT(src.format == dst.format); ASSERT(!(is_dst_msaa && !is_src_msaa)); ASSERT(operation == Fermi2D::Operation::SrcCopy); @@ -1158,6 +1201,9 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im if (src_view.format == PixelFormat::D24_UNORM_S8_UINT) { return blit_image_helper.ConvertS8D24ToABGR8(dst, src_view); } + if (src_view.format == PixelFormat::D32_FLOAT) { + return blit_image_helper.ConvertD32FToABGR8(dst, src_view); + } break; case PixelFormat::R32_FLOAT: if (src_view.format == PixelFormat::D32_FLOAT) { @@ -1277,7 +1323,11 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, void TextureCacheRuntime::CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies) { - UNIMPLEMENTED_MSG("Copying images with different samples is not implemented in Vulkan."); + const bool msaa_to_non_msaa = src.info.num_samples > 1 && dst.info.num_samples == 1; + if (msaa_copy_pass) { + return msaa_copy_pass->CopyImage(dst, src, copies, msaa_to_non_msaa); + } + UNIMPLEMENTED_MSG("Copying images with different samples is not supported."); } u64 TextureCacheRuntime::GetDeviceLocalMemory() const { @@ -1301,12 +1351,19 @@ 
Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu runtime->ViewFormats(info.format))), aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { - if (Settings::values.async_astc.GetValue()) { + switch (Settings::values.accelerate_astc.GetValue()) { + case Settings::AstcDecodeMode::Gpu: + if (Settings::values.astc_recompression.GetValue() == + Settings::AstcRecompression::Uncompressed && + info.size.depth == 1) { + flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + } + break; + case Settings::AstcDecodeMode::CpuAsynchronous: flags |= VideoCommon::ImageFlagBits::AsynchronousDecode; - } else if (Settings::values.astc_recompression.GetValue() == - Settings::AstcRecompression::Uncompressed && - Settings::values.accelerate_astc.GetValue() && info.size.depth == 1) { - flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + break; + default: + break; } flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; @@ -1318,39 +1375,15 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu if (runtime->device.HasDebuggingToolAttached()) { original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } - static constexpr VkImageViewUsageCreateInfo storage_image_view_usage_create_info{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, - .pNext = nullptr, - .usage = VK_IMAGE_USAGE_STORAGE_BIT, - }; current_image = *original_image; + storage_image_views.resize(info.resources.levels); if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported() && Settings::values.astc_recompression.GetValue() == Settings::AstcRecompression::Uncompressed) { const auto& device = runtime->device.GetLogical(); - storage_image_views.reserve(info.resources.levels); for (s32 level = 0; level < info.resources.levels; ++level) { - 
storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = &storage_image_view_usage_create_info, - .flags = 0, - .image = *original_image, - .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, - .format = VK_FORMAT_A8B8G8R8_UNORM_PACK32, - .components{ - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange{ - .aspectMask = aspect_mask, - .baseMipLevel = static_cast<u32>(level), - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, - })); + storage_image_views[level] = + MakeStorageView(device, level, *original_image, VK_FORMAT_A8B8G8R8_UNORM_PACK32); } } } @@ -1481,6 +1514,17 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferIm DownloadMemory(buffers, offsets, copies); } +VkImageView Image::StorageImageView(s32 level) noexcept { + auto& view = storage_image_views[level]; + if (!view) { + const auto format_info = + MaxwellToVK::SurfaceFormat(runtime->device, FormatType::Optimal, true, info.format); + view = + MakeStorageView(runtime->device.GetLogical(), level, current_image, format_info.format); + } + return *view; +} + bool Image::IsRescaled() const noexcept { return True(flags & ImageFlagBits::Rescaled); } @@ -1618,8 +1662,8 @@ bool Image::NeedsScaleHelper() const { return true; } static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal; - const PixelFormat format = StorageFormat(info.format); - const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format; + const auto vk_format = + MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, info.format).format; const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; const bool needs_blit_helper = !device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT); return needs_blit_helper; @@ 
-1641,7 +1685,8 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI }; if (!info.IsRenderTarget()) { swizzle = info.Swizzle(); - TryTransformSwizzleIfNeeded(format, swizzle, device->MustEmulateBGR565()); + TryTransformSwizzleIfNeeded(format, swizzle, device->MustEmulateBGR565(), + !device->IsExt4444FormatsSupported()); if ((aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0) { std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 6621210ea..d6c5a15cc 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -5,11 +5,12 @@ #include <span> +#include "video_core/texture_cache/texture_cache_base.h" + #include "shader_recompiler/shader_info.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/texture_cache/image_view_base.h" -#include "video_core/texture_cache/texture_cache_base.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -116,6 +117,7 @@ public: BlitImageHelper& blit_image_helper; RenderPassCache& render_pass_cache; std::optional<ASTCDecoderPass> astc_decoder_pass; + std::unique_ptr<MSAACopyPass> msaa_copy_pass; const Settings::ResolutionScalingInfo& resolution; std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats; @@ -160,15 +162,13 @@ public: return aspect_mask; } - [[nodiscard]] VkImageView StorageImageView(s32 level) const noexcept { - return *storage_image_views[level]; - } - /// Returns true when the image is already initialized and mark it as initialized [[nodiscard]] bool ExchangeInitialization() noexcept { return std::exchange(initialized, true); } + VkImageView StorageImageView(s32 level) noexcept; + bool 
IsRescaled() const noexcept; bool ScaleUp(bool ignore = false); diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp index 460d8d59d..04a51f2d1 100644 --- a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp +++ b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp @@ -62,7 +62,7 @@ void TurboMode::Run(std::stop_token stop_token) { auto descriptor_pool = dld.CreateDescriptorPool(VkDescriptorPoolCreateInfo{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = nullptr, - .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .flags = 0, .maxSets = 1, .poolSizeCount = 1, .pPoolSizes = &pool_size, diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index 01701201d..e81cd031b 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -51,6 +51,11 @@ bool ShaderCache::RefreshStages(std::array<u64, 6>& unique_hashes) { } const auto& shader_config{maxwell3d->regs.pipelines[index]}; const auto program{static_cast<Tegra::Engines::Maxwell3D::Regs::ShaderType>(index)}; + if (program == Tegra::Engines::Maxwell3D::Regs::ShaderType::Pixel && + !maxwell3d->regs.rasterize_enable) { + unique_hashes[index] = 0; + continue; + } const GPUVAddr shader_addr{base_addr + shader_config.offset}; const std::optional<VAddr> cpu_shader_addr{gpu_memory->GpuToCpuAddress(shader_addr)}; if (!cpu_shader_addr) { diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h index de8e08002..a76896620 100644 --- a/src/video_core/shader_cache.h +++ b/src/video_core/shader_cache.h @@ -70,7 +70,7 @@ public: protected: struct GraphicsEnvironments { std::array<GraphicsEnvironment, NUM_PROGRAMS> envs; - std::array<Shader::Environment*, NUM_PROGRAMS> env_ptrs; + std::array<Shader::Environment*, NUM_PROGRAMS> env_ptrs{}; std::span<Shader::Environment* const> Span() const noexcept { return std::span(env_ptrs.begin(), std::ranges::find(env_ptrs, nullptr)); diff --git 
a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index c7cb56243..4edbe5700 100644 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -102,7 +102,8 @@ static std::string_view StageToPrefix(Shader::Stage stage) { } } -static void DumpImpl(u64 hash, const u64* code, u32 read_highest, u32 read_lowest, +static void DumpImpl(u64 pipeline_hash, u64 shader_hash, std::span<const u64> code, + [[maybe_unused]] u32 read_highest, [[maybe_unused]] u32 read_lowest, u32 initial_offset, Shader::Stage stage) { const auto shader_dir{Common::FS::GetYuzuPath(Common::FS::YuzuPath::DumpDir)}; const auto base_dir{shader_dir / "shaders"}; @@ -111,13 +112,18 @@ static void DumpImpl(u64 hash, const u64* code, u32 read_highest, u32 read_lowes return; } const auto prefix = StageToPrefix(stage); - const auto name{base_dir / fmt::format("{}{:016x}.ash", prefix, hash)}; - const size_t real_size = read_highest - read_lowest + initial_offset; - const size_t padding_needed = ((32 - (real_size % 32)) % 32); + const auto name{base_dir / + fmt::format("{:016x}_{}_{:016x}.ash", pipeline_hash, prefix, shader_hash)}; std::fstream shader_file(name, std::ios::out | std::ios::binary); + ASSERT(initial_offset % sizeof(u64) == 0); const size_t jump_index = initial_offset / sizeof(u64); - shader_file.write(reinterpret_cast<const char*>(code + jump_index), real_size); - for (size_t i = 0; i < padding_needed; i++) { + const size_t code_size = code.size_bytes() - initial_offset; + shader_file.write(reinterpret_cast<const char*>(&code[jump_index]), code_size); + + // + 1 instruction, due to the fact that we skip the final self branch instruction in the code, + // but we need to consider it for padding, otherwise nvdisasm rages. 
+ const size_t padding_needed = (32 - ((code_size + INST_SIZE) % 32)) % 32; + for (size_t i = 0; i < INST_SIZE + padding_needed; i++) { shader_file.put(0); } } @@ -197,8 +203,8 @@ u64 GenericEnvironment::CalculateHash() const { return Common::CityHash64(data.get(), size); } -void GenericEnvironment::Dump(u64 hash) { - DumpImpl(hash, code.data(), read_highest, read_lowest, initial_offset, stage); +void GenericEnvironment::Dump(u64 pipeline_hash, u64 shader_hash) { + DumpImpl(pipeline_hash, shader_hash, code, read_highest, read_lowest, initial_offset, stage); } void GenericEnvironment::Serialize(std::ofstream& file) const { @@ -282,6 +288,7 @@ std::optional<u64> GenericEnvironment::TryFindSize() { Tegra::Texture::TICEntry GenericEnvironment::ReadTextureInfo(GPUVAddr tic_addr, u32 tic_limit, bool via_header_index, u32 raw) { const auto handle{Tegra::Texture::TexturePair(raw, via_header_index)}; + ASSERT(handle.first <= tic_limit); const GPUVAddr descriptor_addr{tic_addr + handle.first * sizeof(Tegra::Texture::TICEntry)}; Tegra::Texture::TICEntry entry; gpu_memory->ReadBlock(descriptor_addr, &entry, sizeof(entry)); @@ -465,8 +472,8 @@ void FileEnvironment::Deserialize(std::ifstream& file) { .read(reinterpret_cast<char*>(&read_highest), sizeof(read_highest)) .read(reinterpret_cast<char*>(&viewport_transform_state), sizeof(viewport_transform_state)) .read(reinterpret_cast<char*>(&stage), sizeof(stage)); - code = std::make_unique<u64[]>(Common::DivCeil(code_size, sizeof(u64))); - file.read(reinterpret_cast<char*>(code.get()), code_size); + code.resize(Common::DivCeil(code_size, sizeof(u64))); + file.read(reinterpret_cast<char*>(code.data()), code_size); for (size_t i = 0; i < num_texture_types; ++i) { u32 key; Shader::TextureType type; @@ -509,8 +516,8 @@ void FileEnvironment::Deserialize(std::ifstream& file) { is_propietary_driver = texture_bound == 2; } -void FileEnvironment::Dump(u64 hash) { - DumpImpl(hash, code.get(), read_highest, read_lowest, initial_offset, 
stage); +void FileEnvironment::Dump(u64 pipeline_hash, u64 shader_hash) { + DumpImpl(pipeline_hash, shader_hash, code, read_highest, read_lowest, initial_offset, stage); } u64 FileEnvironment::ReadInstruction(u32 address) { diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index a0f61cbda..b90f3d44e 100644 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -58,7 +58,7 @@ public: [[nodiscard]] u64 CalculateHash() const; - void Dump(u64 hash) override; + void Dump(u64 pipeline_hash, u64 shader_hash) override; void Serialize(std::ofstream& file) const; @@ -188,10 +188,10 @@ public: return cbuf_replacements.size() != 0; } - void Dump(u64 hash) override; + void Dump(u64 pipeline_hash, u64 shader_hash) override; private: - std::unique_ptr<u64[]> code; + std::vector<u64> code; std::unordered_map<u32, Shader::TextureType> texture_types; std::unordered_map<u32, Shader::TexturePixelFormat> texture_pixel_formats; std::unordered_map<u64, u32> cbuf_values; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index e16cd5e73..5b3c7aa5a 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -85,6 +85,8 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { return PixelFormat::S8_UINT; case Tegra::DepthFormat::Z32_FLOAT_X24S8_UINT: return PixelFormat::D32_FLOAT_S8_UINT; + case Tegra::DepthFormat::X8Z24_UNORM: + return PixelFormat::X8_D24_UNORM; default: UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::S8_UINT_D24_UNORM; @@ -202,6 +204,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) PixelFormat PixelFormatFromGPUPixelFormat(Service::android::PixelFormat format) { switch (format) { case Service::android::PixelFormat::Rgba8888: + case Service::android::PixelFormat::Rgbx8888: return PixelFormat::A8B8G8R8_UNORM; case Service::android::PixelFormat::Rgb565: return PixelFormat::R5G6B5_UNORM; diff --git 
a/src/video_core/surface.h b/src/video_core/surface.h index 9b9c4d9bc..a5e8e2f62 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -115,6 +115,7 @@ enum class PixelFormat { // Depth formats D32_FLOAT = MaxColorFormat, D16_UNORM, + X8_D24_UNORM, MaxDepthFormat, @@ -251,6 +252,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{ 1, // E5B9G9R9_FLOAT 1, // D32_FLOAT 1, // D16_UNORM + 1, // X8_D24_UNORM 1, // S8_UINT 1, // D24_UNORM_S8_UINT 1, // S8_UINT_D24_UNORM @@ -360,6 +362,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{ 1, // E5B9G9R9_FLOAT 1, // D32_FLOAT 1, // D16_UNORM + 1, // X8_D24_UNORM 1, // S8_UINT 1, // D24_UNORM_S8_UINT 1, // S8_UINT_D24_UNORM @@ -469,6 +472,7 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{ 32, // E5B9G9R9_FLOAT 32, // D32_FLOAT 16, // D16_UNORM + 32, // X8_D24_UNORM 8, // S8_UINT 32, // D24_UNORM_S8_UINT 32, // S8_UINT_D24_UNORM diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 11ced6c38..8c774f512 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -138,8 +138,16 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::E5B9G9R9_FLOAT; case Hash(TextureFormat::Z32, FLOAT): return PixelFormat::D32_FLOAT; + case Hash(TextureFormat::Z32, FLOAT, UINT, UINT, UINT, LINEAR): + return PixelFormat::D32_FLOAT; case Hash(TextureFormat::Z16, UNORM): return PixelFormat::D16_UNORM; + case Hash(TextureFormat::Z16, UNORM, UINT, UINT, UINT, LINEAR): + return PixelFormat::D16_UNORM; + case Hash(TextureFormat::X8Z24, UNORM): + return PixelFormat::X8_D24_UNORM; + case Hash(TextureFormat::X8Z24, UNORM, UINT, UINT, UINT, LINEAR): + return PixelFormat::X8_D24_UNORM; case Hash(TextureFormat::Z24S8, UINT, UNORM, UNORM, UNORM, LINEAR): return PixelFormat::S8_UINT_D24_UNORM; case 
Hash(TextureFormat::Z24S8, UINT, UNORM, UINT, UINT, LINEAR): diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h index 9ee57a076..cabbfcb2d 100644 --- a/src/video_core/texture_cache/formatter.h +++ b/src/video_core/texture_cache/formatter.h @@ -211,6 +211,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str return "D32_FLOAT"; case PixelFormat::D16_UNORM: return "D16_UNORM"; + case PixelFormat::X8_D24_UNORM: + return "X8_D24_UNORM"; case PixelFormat::S8_UINT: return "S8_UINT"; case PixelFormat::D24_UNORM_S8_UINT: diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 55d49d017..0587d7b72 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -41,7 +41,7 @@ enum class ImageFlagBits : u32 { IsRescalable = 1 << 15, AsynchronousDecode = 1 << 16, - IsDecoding = 1 << 17, ///< Is currently being decoded asynchornously. + IsDecoding = 1 << 17, ///< Is currently being decoded asynchronously. 
}; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp index 0c5f4450d..18b9250f9 100644 --- a/src/video_core/texture_cache/image_view_base.cpp +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -85,6 +85,7 @@ bool ImageViewBase::SupportsAnisotropy() const noexcept { // Depth formats case PixelFormat::D32_FLOAT: case PixelFormat::D16_UNORM: + case PixelFormat::X8_D24_UNORM: // Stencil formats case PixelFormat::S8_UINT: // DepthStencil formats diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4457b366f..1bdb0def5 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -719,6 +719,7 @@ typename P::ImageView* TextureCache<P>::TryFindFramebufferImageView(VAddr cpu_ad return nullptr; } const auto& image_map_ids = it->second; + boost::container::small_vector<const ImageBase*, 4> valid_images; for (const ImageMapId map_id : image_map_ids) { const ImageMapView& map = slot_map_views[map_id]; const ImageBase& image = slot_images[map.image_id]; @@ -728,8 +729,20 @@ typename P::ImageView* TextureCache<P>::TryFindFramebufferImageView(VAddr cpu_ad if (image.image_view_ids.empty()) { continue; } - return &slot_image_views[image.image_view_ids.at(0)]; + valid_images.push_back(&image); } + + if (valid_images.size() == 1) [[likely]] { + return &slot_image_views[valid_images[0]->image_view_ids.at(0)]; + } + + if (valid_images.size() > 0) [[unlikely]] { + std::ranges::sort(valid_images, [](const auto* a, const auto* b) { + return a->modification_tick > b->modification_tick; + }); + return &slot_image_views[valid_images[0]->image_view_ids.at(0)]; + } + return nullptr; } diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index e9ec91265..a40825c9f 100644 --- 
a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -243,6 +243,9 @@ public: /// Create channel state. void CreateChannel(Tegra::Control::ChannelState& channel) final override; + /// Prepare an image to be used + void PrepareImage(ImageId image_id, bool is_modification, bool invalidate); + std::recursive_mutex mutex; private: @@ -387,9 +390,6 @@ private: /// Synchronize image aliases, copying data if needed void SynchronizeAliases(ImageId image_id); - /// Prepare an image to be used - void PrepareImage(ImageId image_id, bool is_modification, bool invalidate); - /// Prepare an image view to be used void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate); diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index a83f5d41c..8151cabf0 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -68,6 +68,7 @@ struct LevelInfo { Extent2D tile_size; u32 bpp_log2; u32 tile_width_spacing; + u32 num_levels; }; [[nodiscard]] constexpr u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension) { @@ -118,11 +119,11 @@ template <u32 GOB_EXTENT> } [[nodiscard]] constexpr Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size, - u32 level) { + u32 level, u32 num_levels) { return { .width = AdjustMipBlockSize<GOB_SIZE_X>(num_tiles.width, block_size.width, level), .height = AdjustMipBlockSize<GOB_SIZE_Y>(num_tiles.height, block_size.height, level), - .depth = level == 0 + .depth = level == 0 && num_levels == 1 ? 
block_size.depth : AdjustMipBlockSize<GOB_SIZE_Z>(num_tiles.depth, block_size.depth, level), }; @@ -166,13 +167,6 @@ template <u32 GOB_EXTENT> } [[nodiscard]] constexpr Extent3D TileShift(const LevelInfo& info, u32 level) { - if (level == 0) { - return Extent3D{ - .width = info.block.width, - .height = info.block.height, - .depth = info.block.depth, - }; - } const Extent3D blocks = NumLevelBlocks(info, level); return Extent3D{ .width = AdjustTileSize(info.block.width, GOB_SIZE_X, blocks.width), @@ -257,7 +251,7 @@ template <u32 GOB_EXTENT> } [[nodiscard]] constexpr LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block, - u32 tile_width_spacing) { + u32 tile_width_spacing, u32 num_levels) { const u32 bytes_per_block = BytesPerBlock(format); return { .size = @@ -270,16 +264,18 @@ template <u32 GOB_EXTENT> .tile_size = DefaultBlockSize(format), .bpp_log2 = BytesPerBlockLog2(bytes_per_block), .tile_width_spacing = tile_width_spacing, + .num_levels = num_levels, }; } [[nodiscard]] constexpr LevelInfo MakeLevelInfo(const ImageInfo& info) { - return MakeLevelInfo(info.format, info.size, info.block, info.tile_width_spacing); + return MakeLevelInfo(info.format, info.size, info.block, info.tile_width_spacing, + info.resources.levels); } [[nodiscard]] constexpr u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block, u32 tile_width_spacing, u32 level) { - const LevelInfo info = MakeLevelInfo(format, size, block, tile_width_spacing); + const LevelInfo info = MakeLevelInfo(format, size, block, tile_width_spacing, level); u32 offset = 0; for (u32 current_level = 0; current_level < level; ++current_level) { offset += CalculateLevelSize(info, current_level); @@ -466,7 +462,7 @@ template <u32 GOB_EXTENT> }; const u32 bpp_log2 = BytesPerBlockLog2(info.format); const u32 alignment = StrideAlignment(num_tiles, info.block, bpp_log2, info.tile_width_spacing); - const Extent3D mip_block = AdjustMipBlockSize(num_tiles, info.block, 0); + const 
Extent3D mip_block = AdjustMipBlockSize(num_tiles, info.block, 0, info.resources.levels); return Extent3D{ .width = Common::AlignUpLog2(num_tiles.width, alignment), .height = Common::AlignUpLog2(num_tiles.height, GOB_SIZE_Y_SHIFT + mip_block.height), @@ -533,7 +529,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr UNIMPLEMENTED_IF(copy.image_extent != level_size); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); - const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + const Extent3D block = + AdjustMipBlockSize(num_tiles, level_info.block, level, level_info.num_levels); size_t host_offset = copy.buffer_offset; @@ -698,7 +695,7 @@ u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level) { const Extent2D tile_size = DefaultBlockSize(info.format); const Extent3D level_size = AdjustMipSize(info.size, level); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); - const Extent3D block = AdjustMipBlockSize(num_tiles, info.block, level); + const Extent3D block = AdjustMipBlockSize(num_tiles, info.block, level, info.resources.levels); const u32 bpp_log2 = BytesPerBlockLog2(info.format); return StrideAlignment(num_tiles, block, bpp_log2, info.tile_width_spacing); } @@ -887,7 +884,8 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory .image_extent = level_size, }; const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); - const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + const Extent3D block = + AdjustMipBlockSize(num_tiles, level_info.block, level, level_info.num_levels); const u32 stride_alignment = StrideAlignment(num_tiles, info.block, gob, bpp_log2); size_t guest_layer_offset = 0; @@ -1041,7 +1039,7 @@ Extent3D MipBlockSize(const ImageInfo& info, u32 level) { const Extent2D tile_size = DefaultBlockSize(info.format); const Extent3D level_size = AdjustMipSize(info.size, level); const Extent3D 
num_tiles = AdjustTileSize(level_size, tile_size); - return AdjustMipBlockSize(num_tiles, level_info.block, level); + return AdjustMipBlockSize(num_tiles, level_info.block, level, level_info.num_levels); } boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles(const ImageInfo& info) { @@ -1063,7 +1061,8 @@ boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles(const I for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); - const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + const Extent3D block = + AdjustMipBlockSize(num_tiles, level_info.block, level, level_info.num_levels); params[level] = SwizzleParameters{ .num_tiles = num_tiles, .block = block, @@ -1195,7 +1194,7 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const return std::nullopt; } } else { - // Format comaptibility is not relaxed, ensure we are creating a view on a compatible format + // Format compatibility is not relaxed, ensure we are creating a view on a compatible format if (!IsViewCompatible(existing.format, candidate.format, broken_views, native_bgr)) { return std::nullopt; } @@ -1292,11 +1291,11 @@ u32 MapSizeBytes(const ImageBase& image) { } } -static_assert(CalculateLevelSize(LevelInfo{{1920, 1080, 1}, {0, 2, 0}, {1, 1}, 2, 0}, 0) == +static_assert(CalculateLevelSize(LevelInfo{{1920, 1080, 1}, {0, 2, 0}, {1, 1}, 2, 0, 1}, 0) == 0x7f8000); -static_assert(CalculateLevelSize(LevelInfo{{32, 32, 1}, {0, 0, 4}, {1, 1}, 4, 0}, 0) == 0x40000); +static_assert(CalculateLevelSize(LevelInfo{{32, 32, 1}, {0, 0, 4}, {1, 1}, 4, 0, 1}, 0) == 0x4000); -static_assert(CalculateLevelSize(LevelInfo{{128, 8, 1}, {0, 4, 0}, {1, 1}, 4, 0}, 0) == 0x40000); +static_assert(CalculateLevelSize(LevelInfo{{128, 8, 1}, {0, 4, 0}, {1, 1}, 4, 0, 1}, 0) == 0x4000); 
static_assert(CalculateLevelOffset(PixelFormat::R8_SINT, {1920, 1080, 1}, {0, 2, 0}, 0, 7) == 0x2afc00); diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index d8b88d9bc..39c08b5ae 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -72,12 +72,12 @@ float TSCEntry::MaxAnisotropy() const noexcept { } const auto anisotropic_settings = Settings::values.max_anisotropy.GetValue(); s32 added_anisotropic{}; - if (anisotropic_settings == 0) { + if (anisotropic_settings == Settings::AnisotropyMode::Automatic) { added_anisotropic = Settings::values.resolution_info.up_scale >> Settings::values.resolution_info.down_shift; added_anisotropic = std::max(added_anisotropic - 1, 0); } else { - added_anisotropic = Settings::values.max_anisotropy.GetValue() - 1U; + added_anisotropic = static_cast<u32>(Settings::values.max_anisotropy.GetValue()) - 1U; } return static_cast<float>(1U << (max_anisotropy + added_anisotropic)); } diff --git a/src/video_core/vulkan_common/vma.cpp b/src/video_core/vulkan_common/vma.cpp index 1fe2cf52b..addf10762 100644 --- a/src/video_core/vulkan_common/vma.cpp +++ b/src/video_core/vulkan_common/vma.cpp @@ -2,7 +2,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later #define VMA_IMPLEMENTATION -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 -#include <vk_mem_alloc.h>
\ No newline at end of file +#include "video_core/vulkan_common/vma.h" diff --git a/src/video_core/vulkan_common/vma.h b/src/video_core/vulkan_common/vma.h new file mode 100644 index 000000000..6e25aa1bd --- /dev/null +++ b/src/video_core/vulkan_common/vma.h @@ -0,0 +1,11 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "video_core/vulkan_common/vulkan.h" + +#define VMA_STATIC_VULKAN_FUNCTIONS 0 +#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 + +#include <vk_mem_alloc.h> diff --git a/src/video_core/vulkan_common/vulkan.h b/src/video_core/vulkan_common/vulkan.h new file mode 100644 index 000000000..62aa13291 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan.h @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#define VK_NO_PROTOTYPES +#ifdef _WIN32 +#define VK_USE_PLATFORM_WIN32_KHR +#elif defined(__APPLE__) +#define VK_USE_PLATFORM_METAL_EXT +#elif defined(__ANDROID__) +#define VK_USE_PLATFORM_ANDROID_KHR +#else +#define VK_USE_PLATFORM_XLIB_KHR +#define VK_USE_PLATFORM_WAYLAND_KHR +#endif + +#include <vulkan/vulkan.h> + +// Sanitize macros +#undef CreateEvent +#undef CreateSemaphore +#undef Always +#undef False +#undef None +#undef True diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp index 67e8065a4..448df2d3a 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -63,22 +63,6 @@ VkBool32 DebugUtilCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, return VK_FALSE; } -VkBool32 DebugReportCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, - uint64_t object, size_t location, int32_t messageCode, - const char* pLayerPrefix, const char* pMessage, void* pUserData) { - const VkDebugReportFlagBitsEXT 
severity = static_cast<VkDebugReportFlagBitsEXT>(flags); - const std::string_view message{pMessage}; - if (severity & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG_CRITICAL(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG_WARNING(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG_INFO(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { - LOG_DEBUG(Render_Vulkan, "{}", message); - } - return VK_FALSE; -} } // Anonymous namespace vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance) { @@ -98,15 +82,4 @@ vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance) { }); } -vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance) { - return instance.CreateDebugReportCallback({ - .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - .pNext = nullptr, - .flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | VK_DEBUG_REPORT_INFORMATION_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT, - .pfnCallback = DebugReportCallback, - .pUserData = nullptr, - }); -} - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h index a8af7b406..5e940782f 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.h +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -9,6 +9,4 @@ namespace Vulkan { vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance); -vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance); - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index e04852e01..876cec2e8 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -15,6 +15,7 @@ #include 
"common/polyfill_ranges.h" #include "common/settings.h" #include "video_core/vulkan_common/nsight_aftermath_tracker.h" +#include "video_core/vulkan_common/vma.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -22,8 +23,6 @@ #include <adrenotools/bcenabler.h> #endif -#include <vk_mem_alloc.h> - namespace Vulkan { using namespace Common::Literals; namespace { @@ -72,12 +71,25 @@ constexpr std::array R8G8B8_SSCALED{ VK_FORMAT_UNDEFINED, }; +constexpr std::array VK_FORMAT_R32G32B32_SFLOAT{ + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_UNDEFINED, +}; + +constexpr std::array VK_FORMAT_A4B4G4R4_UNORM_PACK16{ + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_UNDEFINED, +}; + } // namespace Alternatives enum class NvidiaArchitecture { - AmpereOrNewer, + KeplerOrOlder, + Maxwell, + Pascal, + Volta, Turing, - VoltaOrOlder, + AmpereOrNewer, }; template <typename T> @@ -104,6 +116,10 @@ constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { return Alternatives::R16G16B16_SSCALED.data(); case VK_FORMAT_R8G8B8_SSCALED: return Alternatives::R8G8B8_SSCALED.data(); + case VK_FORMAT_R32G32B32_SFLOAT: + return Alternatives::VK_FORMAT_R32G32B32_SFLOAT.data(); + case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT: + return Alternatives::VK_FORMAT_A4B4G4R4_UNORM_PACK16.data(); default: return nullptr; } @@ -131,6 +147,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(vk::Physica VK_FORMAT_A2B10G10R10_UINT_PACK32, VK_FORMAT_A2B10G10R10_UNORM_PACK32, VK_FORMAT_A2B10G10R10_USCALED_PACK32, + VK_FORMAT_A2R10G10B10_UNORM_PACK32, VK_FORMAT_A8B8G8R8_SINT_PACK32, VK_FORMAT_A8B8G8R8_SNORM_PACK32, VK_FORMAT_A8B8G8R8_SRGB_PACK32, @@ -186,6 +203,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(vk::Physica VK_FORMAT_BC7_UNORM_BLOCK, VK_FORMAT_D16_UNORM, VK_FORMAT_D16_UNORM_S8_UINT, + VK_FORMAT_X8_D24_UNORM_PACK32, VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, @@ 
-231,6 +249,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(vk::Physica VK_FORMAT_R32_SINT, VK_FORMAT_R32_UINT, VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT, VK_FORMAT_R4G4_UNORM_PACK8, VK_FORMAT_R5G5B5A1_UNORM_PACK16, VK_FORMAT_R5G6B5_UNORM_PACK16, @@ -306,13 +325,38 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, physical.GetProperties2(physical_properties); if (shading_rate_props.primitiveFragmentShadingRateWithMultipleViewports) { // Only Ampere and newer support this feature + // TODO: Find a way to differentiate Ampere and Ada return NvidiaArchitecture::AmpereOrNewer; } - } - if (exts.contains(VK_NV_SHADING_RATE_IMAGE_EXTENSION_NAME)) { return NvidiaArchitecture::Turing; } - return NvidiaArchitecture::VoltaOrOlder; + + if (exts.contains(VK_EXT_BLEND_OPERATION_ADVANCED_EXTENSION_NAME)) { + VkPhysicalDeviceBlendOperationAdvancedPropertiesEXT advanced_blending_props{}; + advanced_blending_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_PROPERTIES_EXT; + VkPhysicalDeviceProperties2 physical_properties{}; + physical_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + physical_properties.pNext = &advanced_blending_props; + physical.GetProperties2(physical_properties); + if (advanced_blending_props.advancedBlendMaxColorAttachments == 1) { + return NvidiaArchitecture::Maxwell; + } + + if (exts.contains(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME)) { + VkPhysicalDeviceConservativeRasterizationPropertiesEXT conservative_raster_props{}; + conservative_raster_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT; + physical_properties.pNext = &conservative_raster_props; + physical.GetProperties2(physical_properties); + if (conservative_raster_props.degenerateLinesRasterized) { + return NvidiaArchitecture::Volta; + } + return NvidiaArchitecture::Pascal; + } + } + + return NvidiaArchitecture::KeplerOrOlder; } 
std::vector<const char*> ExtensionListForVulkan( @@ -327,6 +371,43 @@ std::vector<const char*> ExtensionListForVulkan( } // Anonymous namespace +void Device::RemoveExtension(bool& extension, const std::string& extension_name) { + extension = false; + loaded_extensions.erase(extension_name); +} + +void Device::RemoveExtensionIfUnsuitable(bool is_suitable, const std::string& extension_name) { + if (loaded_extensions.contains(extension_name) && !is_suitable) { + LOG_WARNING(Render_Vulkan, "Removing unsuitable extension {}", extension_name); + this->RemoveExtension(is_suitable, extension_name); + } +} + +template <typename Feature> +void Device::RemoveExtensionFeature(bool& extension, Feature& feature, + const std::string& extension_name) { + // Unload extension. + this->RemoveExtension(extension, extension_name); + + // Save sType and pNext for chain. + VkStructureType sType = feature.sType; + void* pNext = feature.pNext; + + // Clear feature struct and restore chain. + feature = {}; + feature.sType = sType; + feature.pNext = pNext; +} + +template <typename Feature> +void Device::RemoveExtensionFeatureIfUnsuitable(bool is_suitable, Feature& feature, + const std::string& extension_name) { + if (loaded_extensions.contains(extension_name) && !is_suitable) { + LOG_WARNING(Render_Vulkan, "Removing features for unsuitable extension {}", extension_name); + this->RemoveExtensionFeature(is_suitable, feature, extension_name); + } +} + Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR surface, const vk::InstanceDispatch& dld_) : instance{instance_}, dld{dld_}, physical{physical_}, @@ -376,7 +457,8 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR first_next = &diagnostics_nv; } - is_blit_depth_stencil_supported = TestDepthStencilBlits(); + is_blit_depth24_stencil8_supported = TestDepthStencilBlits(VK_FORMAT_D24_UNORM_S8_UINT); + is_blit_depth32_stencil8_supported = TestDepthStencilBlits(VK_FORMAT_D32_SFLOAT_S8_UINT); 
is_optimal_astc_supported = ComputeIsOptimalAstcSupported(); is_warp_potentially_bigger = !extensions.subgroup_size_control || properties.subgroup_size_control.maxSubgroupSize > GuestWarpSize; @@ -398,21 +480,20 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR if (is_qualcomm || is_turnip) { LOG_WARNING(Render_Vulkan, "Qualcomm and Turnip drivers have broken VK_EXT_custom_border_color"); - extensions.custom_border_color = false; - loaded_extensions.erase(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); + RemoveExtensionFeature(extensions.custom_border_color, features.custom_border_color, + VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); } if (is_qualcomm) { must_emulate_scaled_formats = true; LOG_WARNING(Render_Vulkan, "Qualcomm drivers have broken VK_EXT_extended_dynamic_state"); - extensions.extended_dynamic_state = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state, features.extended_dynamic_state, + VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); LOG_WARNING(Render_Vulkan, "Qualcomm drivers have a slow VK_KHR_push_descriptor implementation"); - extensions.push_descriptor = false; - loaded_extensions.erase(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); + RemoveExtension(extensions.push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); #if defined(ANDROID) && defined(ARCHITECTURE_arm64) // Patch the driver to enable BCn textures. 
@@ -441,34 +522,25 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR must_emulate_scaled_formats = true; LOG_WARNING(Render_Vulkan, "ARM drivers have broken VK_EXT_extended_dynamic_state"); - extensions.extended_dynamic_state = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state, features.extended_dynamic_state, + VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); LOG_WARNING(Render_Vulkan, "ARM drivers have broken VK_EXT_extended_dynamic_state2"); - features.extended_dynamic_state2.extendedDynamicState2 = false; - features.extended_dynamic_state2.extendedDynamicState2LogicOp = false; - features.extended_dynamic_state2.extendedDynamicState2PatchControlPoints = false; - extensions.extended_dynamic_state2 = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state2, features.extended_dynamic_state2, + VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); } if (is_nvidia) { const u32 nv_major_version = (properties.properties.driverVersion >> 22) & 0x3ff; const auto arch = GetNvidiaArchitecture(physical, supported_extensions); - switch (arch) { - case NvidiaArchitecture::AmpereOrNewer: + if (arch >= NvidiaArchitecture::AmpereOrNewer) { LOG_WARNING(Render_Vulkan, "Ampere and newer have broken float16 math"); features.shader_float16_int8.shaderFloat16 = false; - break; - case NvidiaArchitecture::Turing: - break; - case NvidiaArchitecture::VoltaOrOlder: + } else if (arch <= NvidiaArchitecture::Volta) { if (nv_major_version < 527) { LOG_WARNING(Render_Vulkan, "Volta and older have broken VK_KHR_push_descriptor"); - extensions.push_descriptor = false; - loaded_extensions.erase(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); + RemoveExtension(extensions.push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); } - break; } if (nv_major_version >= 510) { LOG_WARNING(Render_Vulkan, "NVIDIA 
Drivers >= 510 do not support MSAA image blits"); @@ -481,8 +553,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR if (version < VK_MAKE_API_VERSION(0, 21, 2, 0)) { LOG_WARNING(Render_Vulkan, "RADV versions older than 21.2 have broken VK_EXT_extended_dynamic_state"); - extensions.extended_dynamic_state = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state, + features.extended_dynamic_state, + VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); } } if (extensions.extended_dynamic_state2 && is_radv) { @@ -491,11 +564,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR LOG_WARNING( Render_Vulkan, "RADV versions older than 22.3.1 have broken VK_EXT_extended_dynamic_state2"); - features.extended_dynamic_state2.extendedDynamicState2 = false; - features.extended_dynamic_state2.extendedDynamicState2LogicOp = false; - features.extended_dynamic_state2.extendedDynamicState2PatchControlPoints = false; - extensions.extended_dynamic_state2 = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state2, + features.extended_dynamic_state2, + VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); } } if (extensions.extended_dynamic_state2 && is_qualcomm) { @@ -505,11 +576,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR // Qualcomm Adreno 7xx drivers do not properly support extended_dynamic_state2. 
LOG_WARNING(Render_Vulkan, "Qualcomm Adreno 7xx drivers have broken VK_EXT_extended_dynamic_state2"); - features.extended_dynamic_state2.extendedDynamicState2 = false; - features.extended_dynamic_state2.extendedDynamicState2LogicOp = false; - features.extended_dynamic_state2.extendedDynamicState2PatchControlPoints = false; - extensions.extended_dynamic_state2 = false; - loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + RemoveExtensionFeature(extensions.extended_dynamic_state2, + features.extended_dynamic_state2, + VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); } } if (extensions.extended_dynamic_state3 && is_radv) { @@ -526,6 +595,13 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR dynamic_state3_enables = false; } } + if (extensions.extended_dynamic_state3 && is_amd_driver) { + LOG_WARNING(Render_Vulkan, + "AMD drivers have broken extendedDynamicState3ColorBlendEquation"); + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false; + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false; + dynamic_state3_blending = false; + } if (extensions.vertex_input_dynamic_state && is_radv) { // TODO(ameerj): Blacklist only offending driver versions // TODO(ameerj): Confirm if RDNA1 is affected @@ -534,9 +610,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR if (is_rdna2) { LOG_WARNING(Render_Vulkan, "RADV has broken VK_EXT_vertex_input_dynamic_state on RDNA2 hardware"); - features.vertex_input_dynamic_state.vertexInputDynamicState = false; - extensions.vertex_input_dynamic_state = false; - loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.vertex_input_dynamic_state, + features.vertex_input_dynamic_state, + VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); } } if (extensions.vertex_input_dynamic_state && is_qualcomm) { @@ -547,21 +623,13 @@ Device::Device(VkInstance 
instance_, vk::PhysicalDevice physical_, VkSurfaceKHR LOG_WARNING( Render_Vulkan, "Qualcomm Adreno 7xx drivers have broken VK_EXT_vertex_input_dynamic_state"); - features.vertex_input_dynamic_state.vertexInputDynamicState = false; - extensions.vertex_input_dynamic_state = false; - loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.vertex_input_dynamic_state, + features.vertex_input_dynamic_state, + VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); } } sets_per_pool = 64; - if (extensions.extended_dynamic_state3 && is_amd_driver && - properties.properties.driverVersion >= VK_MAKE_API_VERSION(0, 2, 0, 270)) { - LOG_WARNING(Render_Vulkan, - "AMD drivers after 23.5.2 have broken extendedDynamicState3ColorBlendEquation"); - features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false; - features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false; - dynamic_state3_blending = false; - } if (is_amd_driver) { // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2. 
sets_per_pool = 96; @@ -577,8 +645,8 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR if (!features.shader_float16_int8.shaderFloat16) { LOG_WARNING(Render_Vulkan, "AMD GCN4 and earlier have broken VK_EXT_sampler_filter_minmax"); - extensions.sampler_filter_minmax = false; - loaded_extensions.erase(VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME); + RemoveExtension(extensions.sampler_filter_minmax, + VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME); } } @@ -586,8 +654,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR const u32 version = (properties.properties.driverVersion << 3) >> 3; if (version < VK_MAKE_API_VERSION(27, 20, 100, 0)) { LOG_WARNING(Render_Vulkan, "Intel has broken VK_EXT_vertex_input_dynamic_state"); - extensions.vertex_input_dynamic_state = false; - loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeature(extensions.vertex_input_dynamic_state, + features.vertex_input_dynamic_state, + VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); } } if (features.shader_float16_int8.shaderFloat16 && is_intel_windows) { @@ -614,10 +683,17 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR // mesa/mesa/-/commit/ff91c5ca42bc80aa411cb3fd8f550aa6fdd16bdc LOG_WARNING(Render_Vulkan, "ANV drivers 22.3.0 to 23.1.0 have broken VK_KHR_push_descriptor"); - extensions.push_descriptor = false; - loaded_extensions.erase(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); + RemoveExtension(extensions.push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); + } + } else if (extensions.push_descriptor && is_nvidia) { + const auto arch = GetNvidiaArchitecture(physical, supported_extensions); + if (arch <= NvidiaArchitecture::Pascal) { + LOG_WARNING(Render_Vulkan, + "Pascal and older architectures have broken VK_KHR_push_descriptor"); + RemoveExtension(extensions.push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); } } + if (is_mvk) { 
LOG_WARNING(Render_Vulkan, "MVK driver breaks when using more than 16 vertex attributes/bindings"); @@ -739,14 +815,13 @@ bool Device::ComputeIsOptimalAstcSupported() const { return true; } -bool Device::TestDepthStencilBlits() const { +bool Device::TestDepthStencilBlits(VkFormat format) const { static constexpr VkFormatFeatureFlags required_features = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; const auto test_features = [](VkFormatProperties props) { return (props.optimalTilingFeatures & required_features) == required_features; }; - return test_features(format_properties.at(VK_FORMAT_D32_SFLOAT_S8_UINT)) && - test_features(format_properties.at(VK_FORMAT_D24_UNORM_S8_UINT)); + return test_features(format_properties.at(format)); } bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, @@ -965,7 +1040,7 @@ bool Device::GetSuitability(bool requires_swapchain) { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR; SetNext(next, properties.push_descriptor); } - if (extensions.subgroup_size_control) { + if (extensions.subgroup_size_control || features.subgroup_size_control.subgroupSizeControl) { properties.subgroup_size_control.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES; SetNext(next, properties.subgroup_size_control); @@ -1009,34 +1084,36 @@ bool Device::GetSuitability(bool requires_swapchain) { return suitable; } -void Device::RemoveExtensionIfUnsuitable(bool is_suitable, const std::string& extension_name) { - if (loaded_extensions.contains(extension_name) && !is_suitable) { - LOG_WARNING(Render_Vulkan, "Removing unsuitable extension {}", extension_name); - loaded_extensions.erase(extension_name); - } -} - void Device::RemoveUnsuitableExtensions() { // VK_EXT_custom_border_color extensions.custom_border_color = features.custom_border_color.customBorderColors && features.custom_border_color.customBorderColorWithoutFormat; - 
RemoveExtensionIfUnsuitable(extensions.custom_border_color, - VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.custom_border_color, features.custom_border_color, + VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); + + // VK_EXT_depth_bias_control + extensions.depth_bias_control = + features.depth_bias_control.depthBiasControl && + features.depth_bias_control.leastRepresentableValueForceUnormRepresentation; + RemoveExtensionFeatureIfUnsuitable(extensions.depth_bias_control, features.depth_bias_control, + VK_EXT_DEPTH_BIAS_CONTROL_EXTENSION_NAME); // VK_EXT_depth_clip_control extensions.depth_clip_control = features.depth_clip_control.depthClipControl; - RemoveExtensionIfUnsuitable(extensions.depth_clip_control, - VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.depth_clip_control, features.depth_clip_control, + VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); // VK_EXT_extended_dynamic_state extensions.extended_dynamic_state = features.extended_dynamic_state.extendedDynamicState; - RemoveExtensionIfUnsuitable(extensions.extended_dynamic_state, - VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.extended_dynamic_state, + features.extended_dynamic_state, + VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); // VK_EXT_extended_dynamic_state2 extensions.extended_dynamic_state2 = features.extended_dynamic_state2.extendedDynamicState2; - RemoveExtensionIfUnsuitable(extensions.extended_dynamic_state2, - VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.extended_dynamic_state2, + features.extended_dynamic_state2, + VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); // VK_EXT_extended_dynamic_state3 dynamic_state3_blending = @@ -1050,35 +1127,38 @@ void Device::RemoveUnsuitableExtensions() { extensions.extended_dynamic_state3 = dynamic_state3_blending || dynamic_state3_enables; dynamic_state3_blending = 
dynamic_state3_blending && extensions.extended_dynamic_state3; dynamic_state3_enables = dynamic_state3_enables && extensions.extended_dynamic_state3; - RemoveExtensionIfUnsuitable(extensions.extended_dynamic_state3, - VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.extended_dynamic_state3, + features.extended_dynamic_state3, + VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); // VK_EXT_provoking_vertex extensions.provoking_vertex = features.provoking_vertex.provokingVertexLast && features.provoking_vertex.transformFeedbackPreservesProvokingVertex; - RemoveExtensionIfUnsuitable(extensions.provoking_vertex, - VK_EXT_PROVOKING_VERTEX_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.provoking_vertex, features.provoking_vertex, + VK_EXT_PROVOKING_VERTEX_EXTENSION_NAME); // VK_KHR_shader_atomic_int64 extensions.shader_atomic_int64 = features.shader_atomic_int64.shaderBufferInt64Atomics && features.shader_atomic_int64.shaderSharedInt64Atomics; - RemoveExtensionIfUnsuitable(extensions.shader_atomic_int64, - VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.shader_atomic_int64, features.shader_atomic_int64, + VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME); // VK_EXT_shader_demote_to_helper_invocation extensions.shader_demote_to_helper_invocation = features.shader_demote_to_helper_invocation.shaderDemoteToHelperInvocation; - RemoveExtensionIfUnsuitable(extensions.shader_demote_to_helper_invocation, - VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.shader_demote_to_helper_invocation, + features.shader_demote_to_helper_invocation, + VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); // VK_EXT_subgroup_size_control extensions.subgroup_size_control = features.subgroup_size_control.subgroupSizeControl && properties.subgroup_size_control.minSubgroupSize <= GuestWarpSize && 
properties.subgroup_size_control.maxSubgroupSize >= GuestWarpSize; - RemoveExtensionIfUnsuitable(extensions.subgroup_size_control, - VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.subgroup_size_control, + features.subgroup_size_control, + VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME); // VK_EXT_transform_feedback extensions.transform_feedback = @@ -1088,24 +1168,27 @@ void Device::RemoveUnsuitableExtensions() { properties.transform_feedback.maxTransformFeedbackBuffers > 0 && properties.transform_feedback.transformFeedbackQueries && properties.transform_feedback.transformFeedbackDraw; - RemoveExtensionIfUnsuitable(extensions.transform_feedback, - VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.transform_feedback, features.transform_feedback, + VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME); // VK_EXT_vertex_input_dynamic_state extensions.vertex_input_dynamic_state = features.vertex_input_dynamic_state.vertexInputDynamicState; - RemoveExtensionIfUnsuitable(extensions.vertex_input_dynamic_state, - VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.vertex_input_dynamic_state, + features.vertex_input_dynamic_state, + VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); // VK_KHR_pipeline_executable_properties if (Settings::values.renderer_shader_feedback.GetValue()) { extensions.pipeline_executable_properties = features.pipeline_executable_properties.pipelineExecutableInfo; - RemoveExtensionIfUnsuitable(extensions.pipeline_executable_properties, - VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.pipeline_executable_properties, + features.pipeline_executable_properties, + VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME); } else { - extensions.pipeline_executable_properties = false; - loaded_extensions.erase(VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME); + 
RemoveExtensionFeature(extensions.pipeline_executable_properties, + features.pipeline_executable_properties, + VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME); } // VK_KHR_workgroup_memory_explicit_layout @@ -1115,8 +1198,9 @@ void Device::RemoveUnsuitableExtensions() { features.workgroup_memory_explicit_layout.workgroupMemoryExplicitLayout8BitAccess && features.workgroup_memory_explicit_layout.workgroupMemoryExplicitLayout16BitAccess && features.workgroup_memory_explicit_layout.workgroupMemoryExplicitLayoutScalarBlockLayout; - RemoveExtensionIfUnsuitable(extensions.workgroup_memory_explicit_layout, - VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + RemoveExtensionFeatureIfUnsuitable(extensions.workgroup_memory_explicit_layout, + features.workgroup_memory_explicit_layout, + VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); } void Device::SetupFamilies(VkSurfaceKHR surface) { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index be3ed45ff..282a2925d 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -20,7 +20,6 @@ VK_DEFINE_HANDLE(VmaAllocator) // Vulkan version in the macro describes the minimum version required for feature availability. // If the Vulkan version is lower than the required version, the named extension is required. 
#define FOR_EACH_VK_FEATURE_1_1(FEATURE) \ - FEATURE(EXT, SubgroupSizeControl, SUBGROUP_SIZE_CONTROL, subgroup_size_control) \ FEATURE(KHR, 16BitStorage, 16BIT_STORAGE, bit16_storage) \ FEATURE(KHR, ShaderAtomicInt64, SHADER_ATOMIC_INT64, shader_atomic_int64) \ FEATURE(KHR, ShaderDrawParameters, SHADER_DRAW_PARAMETERS, shader_draw_parameters) \ @@ -36,15 +35,18 @@ VK_DEFINE_HANDLE(VmaAllocator) #define FOR_EACH_VK_FEATURE_1_3(FEATURE) \ FEATURE(EXT, ShaderDemoteToHelperInvocation, SHADER_DEMOTE_TO_HELPER_INVOCATION, \ - shader_demote_to_helper_invocation) + shader_demote_to_helper_invocation) \ + FEATURE(EXT, SubgroupSizeControl, SUBGROUP_SIZE_CONTROL, subgroup_size_control) // Define all features which may be used by the implementation and require an extension here. #define FOR_EACH_VK_FEATURE_EXT(FEATURE) \ FEATURE(EXT, CustomBorderColor, CUSTOM_BORDER_COLOR, custom_border_color) \ + FEATURE(EXT, DepthBiasControl, DEPTH_BIAS_CONTROL, depth_bias_control) \ FEATURE(EXT, DepthClipControl, DEPTH_CLIP_CONTROL, depth_clip_control) \ FEATURE(EXT, ExtendedDynamicState, EXTENDED_DYNAMIC_STATE, extended_dynamic_state) \ FEATURE(EXT, ExtendedDynamicState2, EXTENDED_DYNAMIC_STATE_2, extended_dynamic_state2) \ FEATURE(EXT, ExtendedDynamicState3, EXTENDED_DYNAMIC_STATE_3, extended_dynamic_state3) \ + FEATURE(EXT, 4444Formats, 4444_FORMATS, format_a4b4g4r4) \ FEATURE(EXT, IndexTypeUint8, INDEX_TYPE_UINT8, index_type_uint8) \ FEATURE(EXT, LineRasterization, LINE_RASTERIZATION, line_rasterization) \ FEATURE(EXT, PrimitiveTopologyListRestart, PRIMITIVE_TOPOLOGY_LIST_RESTART, \ @@ -60,6 +62,7 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define miscellaneous extensions which may be used by the implementation here. 
#define FOR_EACH_VK_EXTENSION(EXTENSION) \ + EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \ EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ @@ -92,11 +95,14 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define extensions where the absence of the extension may result in a degraded experience. #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ + EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ + EXTENSION_NAME(VK_EXT_DEPTH_BIAS_CONTROL_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME) \ + EXTENSION_NAME(VK_EXT_4444_FORMATS_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_LINE_RASTERIZATION_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME) \ @@ -143,7 +149,11 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define features where the absence of the feature may result in a degraded experience. 
#define FOR_EACH_VK_RECOMMENDED_FEATURE(FEATURE_NAME) \ FEATURE_NAME(custom_border_color, customBorderColors) \ + FEATURE_NAME(depth_bias_control, depthBiasControl) \ + FEATURE_NAME(depth_bias_control, leastRepresentableValueForceUnormRepresentation) \ + FEATURE_NAME(depth_bias_control, depthBiasExact) \ FEATURE_NAME(extended_dynamic_state, extendedDynamicState) \ + FEATURE_NAME(format_a4b4g4r4, formatA4B4G4R4) \ FEATURE_NAME(index_type_uint8, indexTypeUint8) \ FEATURE_NAME(primitive_topology_list_restart, primitiveTopologyListRestart) \ FEATURE_NAME(provoking_vertex, provokingVertexLast) \ @@ -304,7 +314,7 @@ public: return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; } - /// Returns true if the device suppors float64 natively. + /// Returns true if the device supports float64 natively. bool IsFloat64Supported() const { return features.features.shaderFloat64; } @@ -319,6 +329,11 @@ public: return features.shader_float16_int8.shaderInt8; } + /// Returns true if the device supports binding multisample images as storage images. + bool IsStorageImageMultisampleSupported() const { + return features.features.shaderStorageImageMultisample; + } + /// Returns true if the device warp size can potentially be bigger than guest's warp size. bool IsWarpSizePotentiallyBiggerThanGuest() const { return is_warp_potentially_bigger; @@ -359,9 +374,14 @@ public: return features.features.depthBounds; } - /// Returns true when blitting from and to depth stencil images is supported. - bool IsBlitDepthStencilSupported() const { - return is_blit_depth_stencil_supported; + /// Returns true when blitting from and to D24S8 images is supported. + bool IsBlitDepth24Stencil8Supported() const { + return is_blit_depth24_stencil8_supported; + } + + /// Returns true when blitting from and to D32S8 images is supported. + bool IsBlitDepth32Stencil8Supported() const { + return is_blit_depth32_stencil8_supported; } /// Returns true if the device supports VK_NV_viewport_swizzle. 
@@ -449,6 +469,11 @@ public: return extensions.depth_clip_control; } + /// Returns true if the device supports VK_EXT_depth_bias_control. + bool IsExtDepthBiasControlSupported() const { + return extensions.depth_bias_control; + } + /// Returns true if the device supports VK_EXT_shader_viewport_index_layer. bool IsExtShaderViewportIndexLayerSupported() const { return extensions.shader_viewport_index_layer; @@ -488,6 +513,11 @@ public: return extensions.extended_dynamic_state3; } + /// Returns true if the device supports VK_EXT_4444_formats. + bool IsExt4444FormatsSupported() const { + return features.format_a4b4g4r4.formatA4B4G4R4; + } + /// Returns true if the device supports VK_EXT_extended_dynamic_state3. bool IsExtExtendedDynamicState3BlendingSupported() const { return dynamic_state3_blending; @@ -528,6 +558,10 @@ public: return extensions.shader_atomic_int64; } + bool IsExtConditionalRendering() const { + return extensions.conditional_rendering; + } + bool HasTimelineSemaphore() const; /// Returns the minimum supported version of SPIR-V. @@ -600,6 +634,10 @@ public: return features.robustness2.nullDescriptor; } + bool HasExactDepthBiasControl() const { + return features.depth_bias_control.depthBiasExact; + } + u32 GetMaxVertexInputAttributes() const { return properties.properties.limits.maxVertexInputAttributes; } @@ -639,8 +677,17 @@ private: // Remove extensions which have incomplete feature support. void RemoveUnsuitableExtensions(); + + void RemoveExtension(bool& extension, const std::string& extension_name); void RemoveExtensionIfUnsuitable(bool is_suitable, const std::string& extension_name); + template <typename Feature> + void RemoveExtensionFeature(bool& extension, Feature& feature, + const std::string& extension_name); + template <typename Feature> + void RemoveExtensionFeatureIfUnsuitable(bool is_suitable, Feature& feature, + const std::string& extension_name); + /// Sets up queue families. 
void SetupFamilies(VkSurfaceKHR surface); @@ -657,7 +704,7 @@ private: bool ComputeIsOptimalAstcSupported() const; /// Returns true if the device natively supports blitting depth stencil images. - bool TestDepthStencilBlits() const; + bool TestDepthStencilBlits(VkFormat format) const; private: VkInstance instance; ///< Vulkan instance. @@ -721,25 +768,26 @@ private: VkPhysicalDeviceProperties2 properties2{}; // Misc features - bool is_optimal_astc_supported{}; ///< Support for all guest ASTC formats. - bool is_blit_depth_stencil_supported{}; ///< Support for blitting from and to depth stencil. - bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. - bool is_integrated{}; ///< Is GPU an iGPU. - bool is_virtual{}; ///< Is GPU a virtual GPU. - bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device. - bool has_broken_compute{}; ///< Compute shaders can cause crashes - bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit - bool has_renderdoc{}; ///< Has RenderDoc attached - bool has_nsight_graphics{}; ///< Has Nsight Graphics attached - bool supports_d24_depth{}; ///< Supports D24 depth buffers. - bool cant_blit_msaa{}; ///< Does not support MSAA<->MSAA blitting. - bool must_emulate_scaled_formats{}; ///< Requires scaled vertex format emulation - bool must_emulate_bgr565{}; ///< Emulates BGR565 by swizzling RGB565 format. - bool dynamic_state3_blending{}; ///< Has all blending features of dynamic_state3. - bool dynamic_state3_enables{}; ///< Has all enables features of dynamic_state3. - bool supports_conditional_barriers{}; ///< Allows barriers in conditional control flow. - u64 device_access_memory{}; ///< Total size of device local memory in bytes. - u32 sets_per_pool{}; ///< Sets per Description Pool + bool is_optimal_astc_supported{}; ///< Support for all guest ASTC formats. + bool is_blit_depth24_stencil8_supported{}; ///< Support for blitting from and to D24S8. 
+ bool is_blit_depth32_stencil8_supported{}; ///< Support for blitting from and to D32S8. + bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + bool is_integrated{}; ///< Is GPU an iGPU. + bool is_virtual{}; ///< Is GPU a virtual GPU. + bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device. + bool has_broken_compute{}; ///< Compute shaders can cause crashes + bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit + bool has_renderdoc{}; ///< Has RenderDoc attached + bool has_nsight_graphics{}; ///< Has Nsight Graphics attached + bool supports_d24_depth{}; ///< Supports D24 depth buffers. + bool cant_blit_msaa{}; ///< Does not support MSAA<->MSAA blitting. + bool must_emulate_scaled_formats{}; ///< Requires scaled vertex format emulation + bool must_emulate_bgr565{}; ///< Emulates BGR565 by swizzling RGB565 format. + bool dynamic_state3_blending{}; ///< Has all blending features of dynamic_state3. + bool dynamic_state3_enables{}; ///< Has all enables features of dynamic_state3. + bool supports_conditional_barriers{}; ///< Allows barriers in conditional control flow. + u64 device_access_memory{}; ///< Total size of device local memory in bytes. + u32 sets_per_pool{}; ///< Sets per Description Pool // Telemetry parameters std::set<std::string, std::less<>> supported_extensions; ///< Reported Vulkan extensions. 
diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index 6a294c1da..180657a75 100644 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -14,19 +14,6 @@ #include "video_core/vulkan_common/vulkan_instance.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -// Include these late to avoid polluting previous headers -#if defined(_WIN32) -#include <windows.h> -// ensure include order -#include <vulkan/vulkan_win32.h> -#elif defined(__ANDROID__) -#include <vulkan/vulkan_android.h> -#elif !defined(__APPLE__) -#include <X11/Xlib.h> -#include <vulkan/vulkan_wayland.h> -#include <vulkan/vulkan_xlib.h> -#endif - namespace Vulkan { namespace { @@ -54,9 +41,6 @@ namespace { bool enable_validation) { std::vector<const char*> extensions; extensions.reserve(6); -#ifdef __APPLE__ - extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif switch (window_type) { case Core::Frontend::WindowSystemType::Headless: break; @@ -87,11 +71,14 @@ namespace { if (window_type != Core::Frontend::WindowSystemType::Headless) { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } - if (enable_validation) { - const bool debug_utils = - AreExtensionsSupported(dld, std::array{VK_EXT_DEBUG_UTILS_EXTENSION_NAME}); - extensions.push_back(debug_utils ? 
VK_EXT_DEBUG_UTILS_EXTENSION_NAME - : VK_EXT_DEBUG_REPORT_EXTENSION_NAME); +#ifdef __APPLE__ + if (AreExtensionsSupported(dld, std::array{VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME})) { + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); + } +#endif + if (enable_validation && + AreExtensionsSupported(dld, std::array{VK_EXT_DEBUG_UTILS_EXTENSION_NAME})) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); } return extensions; } diff --git a/src/video_core/vulkan_common/vulkan_library.cpp b/src/video_core/vulkan_common/vulkan_library.cpp index 47f6f2a03..0130f6a0d 100644 --- a/src/video_core/vulkan_common/vulkan_library.cpp +++ b/src/video_core/vulkan_common/vulkan_library.cpp @@ -19,13 +19,17 @@ std::shared_ptr<Common::DynamicLibrary> OpenLibrary( #else auto library = std::make_shared<Common::DynamicLibrary>(); #ifdef __APPLE__ + const auto libvulkan_filename = + Common::FS::GetBundleDirectory() / "Contents/Frameworks/libvulkan.1.dylib"; + const auto libmoltenvk_filename = + Common::FS::GetBundleDirectory() / "Contents/Frameworks/libMoltenVK.dylib"; + const char* library_paths[] = {std::getenv("LIBVULKAN_PATH"), libvulkan_filename.c_str(), + libmoltenvk_filename.c_str()}; // Check if a path to a specific Vulkan library has been specified. - char* const libvulkan_env = std::getenv("LIBVULKAN_PATH"); - if (!libvulkan_env || !library->Open(libvulkan_env)) { - // Use the libvulkan.dylib from the application bundle. 
- const auto filename = - Common::FS::GetBundleDirectory() / "Contents/Frameworks/libvulkan.dylib"; - void(library->Open(Common::FS::PathToUTF8String(filename).c_str())); + for (const auto& library_path : library_paths) { + if (library_path && library->Open(library_path)) { + break; + } } #else std::string filename = Common::DynamicLibrary::GetVersionedFilename("vulkan", 1); diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index 42f3ee0b4..82767fdf0 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -9,14 +9,14 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/literals.h" #include "common/logging/log.h" #include "common/polyfill_ranges.h" +#include "video_core/vulkan_common/vma.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -#include <vk_mem_alloc.h> - namespace Vulkan { namespace { struct Range { @@ -70,8 +70,7 @@ struct Range { case MemoryUsage::Download: return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; case MemoryUsage::DeviceLocal: - return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | - VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT; + return {}; } return {}; } @@ -213,7 +212,20 @@ MemoryAllocator::MemoryAllocator(const Device& device_) : device{device_}, allocator{device.GetAllocator()}, properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, buffer_image_granularity{ - device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {} + device_.GetPhysical().GetProperties().limits.bufferImageGranularity} { + // GPUs not supporting rebar may only have a region with less than 256MB host visible/device + // local memory. 
In that case, opening 2 RenderDoc captures side-by-side is not possible due to + // the heap running out of memory. With RenderDoc attached and only a small host/device region, + // only allow the stream buffer in this memory heap. + if (device.HasDebuggingToolAttached()) { + using namespace Common::Literals; + ForEachDeviceLocalHostVisibleHeap(device, [this](size_t index, VkMemoryHeap& heap) { + if (heap.size <= 256_MiB) { + valid_memory_types &= ~(1u << index); + } + }); + } +} MemoryAllocator::~MemoryAllocator() = default; @@ -245,7 +257,7 @@ vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsa .usage = MemoryUsageVma(usage), .requiredFlags = 0, .preferredFlags = MemoryUsagePreferedVmaFlags(usage), - .memoryTypeBits = 0, + .memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types, .pool = VK_NULL_HANDLE, .pUserData = nullptr, .priority = 0.f, diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index f449bc8d0..38a182bcb 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -7,6 +7,7 @@ #include <span> #include <vector> #include "common/common_types.h" +#include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_wrapper.h" VK_DEFINE_HANDLE(VmaAllocator) @@ -26,6 +27,18 @@ enum class MemoryUsage { Stream, ///< Requests device local host visible buffer, falling back host memory. 
}; +template <typename F> +void ForEachDeviceLocalHostVisibleHeap(const Device& device, F&& f) { + auto memory_props = device.GetPhysical().GetMemoryProperties().memoryProperties; + for (size_t i = 0; i < memory_props.memoryTypeCount; i++) { + auto& memory_type = memory_props.memoryTypes[i]; + if ((memory_type.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && + (memory_type.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) { + f(memory_type.heapIndex, memory_props.memoryHeaps[memory_type.heapIndex]); + } + } +} + /// Ownership handle of a memory commitment. /// Points to a subregion of a memory allocation. class MemoryCommit { @@ -124,6 +137,7 @@ private: std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations. VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers // and optimal images + u32 valid_memory_types{~0u}; }; } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp index cfea4cd7b..e45f8e43f 100644 --- a/src/video_core/vulkan_common/vulkan_surface.cpp +++ b/src/video_core/vulkan_common/vulkan_surface.cpp @@ -6,19 +6,6 @@ #include "video_core/vulkan_common/vulkan_surface.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -// Include these late to avoid polluting previous headers -#ifdef _WIN32 -#include <windows.h> -// ensure include order -#include <vulkan/vulkan_win32.h> -#elif defined(__ANDROID__) -#include <vulkan/vulkan_android.h> -#elif !defined(__APPLE__) -#include <X11/Xlib.h> -#include <vulkan/vulkan_wayland.h> -#include <vulkan/vulkan_xlib.h> -#endif - namespace Vulkan { vk::SurfaceKHR CreateSurface( diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 2fa29793a..2f3254a97 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -9,11 +9,9 @@ #include 
"common/common_types.h" #include "common/logging/log.h" - +#include "video_core/vulkan_common/vma.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -#include <vk_mem_alloc.h> - namespace Vulkan::vk { namespace { @@ -77,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkBeginCommandBuffer); X(vkBindBufferMemory); X(vkBindImageMemory); + X(vkCmdBeginConditionalRenderingEXT); X(vkCmdBeginQuery); X(vkCmdBeginRenderPass); X(vkCmdBeginTransformFeedbackEXT); @@ -93,13 +92,17 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdCopyBufferToImage); X(vkCmdCopyImage); X(vkCmdCopyImageToBuffer); + X(vkCmdCopyQueryPoolResults); X(vkCmdDispatch); + X(vkCmdDispatchIndirect); X(vkCmdDraw); X(vkCmdDrawIndexed); X(vkCmdDrawIndirect); X(vkCmdDrawIndexedIndirect); X(vkCmdDrawIndirectCount); X(vkCmdDrawIndexedIndirectCount); + X(vkCmdDrawIndirectByteCountEXT); + X(vkCmdEndConditionalRenderingEXT); X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); @@ -110,6 +113,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdPushDescriptorSetWithTemplateKHR); X(vkCmdSetBlendConstants); X(vkCmdSetDepthBias); + X(vkCmdSetDepthBias2EXT); X(vkCmdSetDepthBounds); X(vkCmdSetEvent); X(vkCmdSetScissor); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 32bd75ad8..0487cd3b6 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -12,23 +12,8 @@ #include <utility> #include <vector> -#define VK_NO_PROTOTYPES -#ifdef _WIN32 -#define VK_USE_PLATFORM_WIN32_KHR -#elif defined(__APPLE__) -#define VK_USE_PLATFORM_METAL_EXT -#endif -#include <vulkan/vulkan.h> - -// Sanitize macros -#ifdef CreateEvent -#undef CreateEvent -#endif -#ifdef CreateSemaphore -#undef CreateSemaphore -#endif - #include "common/common_types.h" +#include "video_core/vulkan_common/vulkan.h" #ifdef _MSC_VER #pragma warning(disable : 26812) 
// Disable prefer enum class over enum @@ -132,6 +117,9 @@ public: virtual ~Exception() = default; const char* what() const noexcept override; + VkResult GetResult() const noexcept { + return result; + } private: VkResult result; @@ -200,6 +188,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; PFN_vkBindBufferMemory vkBindBufferMemory{}; PFN_vkBindImageMemory vkBindImageMemory{}; + PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{}; PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; PFN_vkCmdBeginQuery vkCmdBeginQuery{}; PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; @@ -217,13 +206,17 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; PFN_vkCmdCopyImage vkCmdCopyImage{}; PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; + PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{}; PFN_vkCmdDispatch vkCmdDispatch{}; + PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{}; PFN_vkCmdDraw vkCmdDraw{}; PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; PFN_vkCmdDrawIndirect vkCmdDrawIndirect{}; PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; + PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{}; + PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; @@ -236,6 +229,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdSetBlendConstants vkCmdSetBlendConstants{}; PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT{}; PFN_vkCmdSetDepthBias vkCmdSetDepthBias{}; + PFN_vkCmdSetDepthBias2EXT vkCmdSetDepthBias2EXT{}; PFN_vkCmdSetDepthBounds vkCmdSetDepthBounds{}; PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT{}; PFN_vkCmdSetDepthCompareOpEXT 
vkCmdSetDepthCompareOpEXT{}; @@ -1196,6 +1190,13 @@ public: count_offset, draw_count, stride); } + void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer, + VkDeviceSize counter_buffer_offset, u32 counter_offset, + u32 stride) { + dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer, + counter_buffer_offset, counter_offset, stride); + } + void ClearAttachments(Span<VkClearAttachment> attachments, Span<VkClearRect> rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), @@ -1224,6 +1225,10 @@ public: dld->vkCmdDispatch(handle, x, y, z); } + void DispatchIndirect(VkBuffer indirect_buffer, VkDeviceSize offset) const noexcept { + dld->vkCmdDispatchIndirect(handle, indirect_buffer, offset); + } + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkDependencyFlags dependency_flags, Span<VkMemoryBarrier> memory_barriers, Span<VkBufferMemoryBarrier> buffer_barriers, @@ -1280,6 +1285,13 @@ public: regions.data()); } + void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count, + VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride, + VkQueryResultFlags flags) const noexcept { + dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer, + dst_offset, stride, flags); + } + void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, u32 data) const noexcept { dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); @@ -1325,6 +1337,18 @@ public: dld->vkCmdSetDepthBias(handle, constant_factor, clamp, slope_factor); } + void SetDepthBias(float constant_factor, float clamp, float slope_factor, + VkDepthBiasRepresentationInfoEXT* extra) const noexcept { + VkDepthBiasInfoEXT info{ + .sType = VK_STRUCTURE_TYPE_DEPTH_BIAS_INFO_EXT, + .pNext = extra, + .depthBiasConstantFactor = constant_factor, + .depthBiasClamp = 
clamp, + .depthBiasSlopeFactor = slope_factor, + }; + dld->vkCmdSetDepthBias2EXT(handle, &info); + } + void SetDepthBounds(float min_depth_bounds, float max_depth_bounds) const noexcept { dld->vkCmdSetDepthBounds(handle, min_depth_bounds, max_depth_bounds); } @@ -1458,6 +1482,15 @@ public: counter_buffers, counter_buffer_offsets); } + void BeginConditionalRenderingEXT( + const VkConditionalRenderingBeginInfoEXT& info) const noexcept { + dld->vkCmdBeginConditionalRenderingEXT(handle, &info); + } + + void EndConditionalRenderingEXT() const noexcept { + dld->vkCmdEndConditionalRenderingEXT(handle); + } + void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { const VkDebugUtilsLabelEXT label_info{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, |