diff options
Diffstat (limited to 'src/video_core')
23 files changed, 520 insertions, 189 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 40e6d1ec4..cb8b46edf 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -82,6 +82,7 @@ add_library(video_core STATIC gpu_thread.h memory_manager.cpp memory_manager.h + pte_kind.h query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp index 1039e036f..c2ecc12f5 100644 --- a/src/video_core/dirty_flags.cpp +++ b/src/video_core/dirty_flags.cpp @@ -61,7 +61,7 @@ void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) { } void SetupDirtyShaders(Maxwell3D::DirtyState::Tables& tables) { - FillBlock(tables[0], OFF(pipelines), NUM(pipelines) * Maxwell3D::Regs::MaxShaderProgram, + FillBlock(tables[0], OFF(pipelines), NUM(pipelines[0]) * Maxwell3D::Regs::MaxShaderProgram, Shaders); } } // Anonymous namespace diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index fdf470913..b1a22b76c 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -74,15 +74,15 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.stencil_front_op.zfail = Regs::StencilOp::Op::Keep_D3D; regs.stencil_front_op.zpass = Regs::StencilOp::Op::Keep_D3D; regs.stencil_front_op.func = Regs::ComparisonOp::Always_GL; - regs.stencil_front_func.func_mask = 0xFFFFFFFF; - regs.stencil_front_func.mask = 0xFFFFFFFF; + regs.stencil_front_func_mask = 0xFFFFFFFF; + regs.stencil_front_mask = 0xFFFFFFFF; regs.stencil_two_side_enable = 1; regs.stencil_back_op.fail = Regs::StencilOp::Op::Keep_D3D; regs.stencil_back_op.zfail = Regs::StencilOp::Op::Keep_D3D; regs.stencil_back_op.zpass = Regs::StencilOp::Op::Keep_D3D; regs.stencil_back_op.func = Regs::ComparisonOp::Always_GL; - regs.stencil_back_func.func_mask = 0xFFFFFFFF; - regs.stencil_back_func.mask = 0xFFFFFFFF; + regs.stencil_back_func_mask = 0xFFFFFFFF; + regs.stencil_back_mask = 0xFFFFFFFF; regs.depth_test_func = Regs::ComparisonOp::Always_GL; regs.gl_front_face = Regs::FrontFace::CounterClockWise; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index efe1073b0..75e3b868d 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -390,7 +390,7 @@ public: FractionalEven = 2, }; - enum class OutputPrimitves : u32 { + enum class OutputPrimitives : u32 { Points = 0, Lines = 1, Triangles_CW = 2, @@ -401,7 +401,7 @@ public: union { BitField<0, 2, DomainType> domain_type; BitField<4, 2, Spacing> spacing; - BitField<8, 2, OutputPrimitves> output_primitives; + BitField<8, 2, OutputPrimitives> output_primitives; }; } params; @@ -1795,12 +1795,6 @@ public: ComparisonOp func; }; - struct StencilFunc { - s32 ref; - u32 func_mask; - u32 mask; - }; - struct PsSaturate { // Opposite of DepthMode enum class Depth : u32 { @@ -2737,7 +2731,9 @@ public: u32 post_z_pixel_imask; ///< 0x0F1C INSERT_PADDING_BYTES_NOINIT(0x20); ConstantColorRendering const_color_rendering; ///< 0x0F40 - StencilFunc stencil_back_func; ///< 0x0F54 + s32 stencil_back_ref; ///< 0x0F54 + u32 stencil_back_mask; ///< 0x0F58 + u32 stencil_back_func_mask; ///< 0x0F5C INSERT_PADDING_BYTES_NOINIT(0x24); VertexStreamSubstitute vertex_stream_substitute; ///< 0x0F84 u32 line_mode_clip_generated_edge_do_not_draw; ///< 0x0F8C @@ -2855,7 +2851,9 @@ public: Blend blend; ///< 0x133C u32 stencil_enable; ///< 0x1380 StencilOp stencil_front_op; ///< 0x1384 - StencilFunc stencil_front_func; ///< 0x1394 + s32 stencil_front_ref; ///< 0x1394 + s32 stencil_front_func_mask; ///< 0x1398 + s32 stencil_front_mask; ///< 0x139C INSERT_PADDING_BYTES_NOINIT(0x4); u32 draw_auto_start_byte_count; ///< 0x13A4 PsSaturate frag_color_clamp; ///< 0x13A8 @@ -3311,7 +3309,9 @@ ASSERT_REG_POSITION(vpc_perf, 0x0F14); ASSERT_REG_POSITION(pm_local_trigger, 0x0F18); ASSERT_REG_POSITION(post_z_pixel_imask, 0x0F1C); ASSERT_REG_POSITION(const_color_rendering, 0x0F40); -ASSERT_REG_POSITION(stencil_back_func, 0x0F54); +ASSERT_REG_POSITION(stencil_back_ref, 0x0F54); +ASSERT_REG_POSITION(stencil_back_mask, 0x0F58); +ASSERT_REG_POSITION(stencil_back_func_mask, 0x0F5C); ASSERT_REG_POSITION(vertex_stream_substitute, 0x0F84); ASSERT_REG_POSITION(line_mode_clip_generated_edge_do_not_draw, 0x0F8C); ASSERT_REG_POSITION(color_mask_common, 0x0F90); @@ -3416,7 +3416,9 @@ ASSERT_REG_POSITION(invalidate_texture_data_cache_lines, 0x1338); ASSERT_REG_POSITION(blend, 0x133C); ASSERT_REG_POSITION(stencil_enable, 0x1380); ASSERT_REG_POSITION(stencil_front_op, 0x1384); -ASSERT_REG_POSITION(stencil_front_func, 0x1394); +ASSERT_REG_POSITION(stencil_front_ref, 0x1394); +ASSERT_REG_POSITION(stencil_front_func_mask, 0x1398); +ASSERT_REG_POSITION(stencil_front_mask, 0x139C); ASSERT_REG_POSITION(draw_auto_start_byte_count, 0x13A4); ASSERT_REG_POSITION(frag_color_clamp, 0x13A8); ASSERT_REG_POSITION(window_origin, 0x13AC); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 3909d36c1..4eb7a100d 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -56,66 +56,85 @@ void MaxwellDMA::Launch() { ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); - const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; - const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; - - if (!is_src_pitch && !is_dst_pitch) { - // If both the source and the destination are in block layout, assert. - UNIMPLEMENTED_MSG("Tiled->Tiled DMA transfers are not yet implemented"); - return; - } + if (launch.multi_line_enable) { + const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; + const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; + + if (!is_src_pitch && !is_dst_pitch) { + // If both the source and the destination are in block layout, assert. + UNIMPLEMENTED_MSG("Tiled->Tiled DMA transfers are not yet implemented"); + return; + } - if (is_src_pitch && is_dst_pitch) { - CopyPitchToPitch(); + if (is_src_pitch && is_dst_pitch) { + for (u32 line = 0; line < regs.line_count; ++line) { + const GPUVAddr source_line = + regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; + const GPUVAddr dest_line = + regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; + memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); + } + } else { + if (!is_src_pitch && is_dst_pitch) { + CopyBlockLinearToPitch(); + } else { + CopyPitchToBlockLinear(); + } + } } else { - ASSERT(launch.multi_line_enable == 1); - - if (!is_src_pitch && is_dst_pitch) { - CopyBlockLinearToPitch(); + // TODO: allow multisized components. + auto& accelerate = rasterizer->AccessAccelerateDMA(); + const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; + if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) { + ASSERT(regs.remap_const.component_size_minus_one == 3); + accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); + std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value); + memory_manager.WriteBlockUnsafe(regs.offset_out, + reinterpret_cast<u8*>(tmp_buffer.data()), + regs.line_length_in * sizeof(u32)); } else { - CopyPitchToBlockLinear(); + auto convert_linear_2_blocklinear_addr = [](u64 address) { + return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) | + ((address & 0x180) >> 1) | ((address & 0x20) << 3); + }; + auto src_kind = memory_manager.GetPageKind(regs.offset_in); + auto dst_kind = memory_manager.GetPageKind(regs.offset_out); + const bool is_src_pitch = IsPitchKind(static_cast<PTEKind>(src_kind)); + const bool is_dst_pitch = IsPitchKind(static_cast<PTEKind>(dst_kind)); + if (!is_src_pitch && is_dst_pitch) { + std::vector<u8> tmp_buffer(regs.line_length_in); + std::vector<u8> dst_buffer(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), + regs.line_length_in); + for (u32 offset = 0; offset < regs.line_length_in; ++offset) { + dst_buffer[offset] = + tmp_buffer[convert_linear_2_blocklinear_addr(regs.offset_in + offset) - + regs.offset_in]; + } + memory_manager.WriteBlock(regs.offset_out, dst_buffer.data(), regs.line_length_in); + } else if (is_src_pitch && !is_dst_pitch) { + std::vector<u8> tmp_buffer(regs.line_length_in); + std::vector<u8> dst_buffer(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), + regs.line_length_in); + for (u32 offset = 0; offset < regs.line_length_in; ++offset) { + dst_buffer[convert_linear_2_blocklinear_addr(regs.offset_out + offset) - + regs.offset_out] = tmp_buffer[offset]; + } + memory_manager.WriteBlock(regs.offset_out, dst_buffer.data(), regs.line_length_in); + } else { + if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { + std::vector<u8> tmp_buffer(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), + regs.line_length_in); + memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), + regs.line_length_in); + } + } } } - ReleaseSemaphore(); -} -void MaxwellDMA::CopyPitchToPitch() { - // When `multi_line_enable` bit is enabled we copy a 2D image of dimensions - // (line_length_in, line_count). - // Otherwise the copy is performed as if we were copying a 1D buffer of length line_length_in. - const bool remap_enabled = regs.launch_dma.remap_enable != 0; - if (regs.launch_dma.multi_line_enable) { - UNIMPLEMENTED_IF(remap_enabled); - - // Perform a line-by-line copy. - // We're going to take a subrect of size (line_length_in, line_count) from the source - // rectangle. There is no need to manually flush/invalidate the regions because CopyBlock - // does that for us. - for (u32 line = 0; line < regs.line_count; ++line) { - const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; - const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; - memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); - } - return; - } - // TODO: allow multisized components. - auto& accelerate = rasterizer->AccessAccelerateDMA(); - const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; - const bool is_buffer_clear = remap_enabled && is_const_a_dst; - if (is_buffer_clear) { - ASSERT(regs.remap_const.component_size_minus_one == 3); - accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); - std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value); - memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()), - regs.line_length_in * sizeof(u32)); - return; - } - UNIMPLEMENTED_IF(remap_enabled); - if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - std::vector<u8> tmp_buffer(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); - memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in); - } + ReleaseSemaphore(); } void MaxwellDMA::CopyBlockLinearToPitch() { diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index bc48320ce..953e34adc 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -219,8 +219,6 @@ private: /// registers. void Launch(); - void CopyPitchToPitch(); - void CopyBlockLinearToPitch(); void CopyPitchToBlockLinear(); diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp index 326e8355a..a44fc83d3 100644 --- a/src/video_core/host1x/syncpoint_manager.cpp +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -36,7 +36,17 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_storage, ActionHandle& handle) { std::unique_lock lk(guard); - action_storage.erase(handle); + + // We want to ensure the iterator still exists prior to erasing it + // Otherwise, if an invalid iterator was passed in then it could lead to UB + // It is important to avoid UB in that case since the deregister isn't called from a locked + // context + for (auto it = action_storage.begin(); it != action_storage.end(); it++) { + if (it == handle) { + action_storage.erase(it); + return; + } + } } void SyncpointManager::DeregisterGuestAction(u32 syncpoint_id, ActionHandle& handle) { diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index cca401c74..d07b21bd6 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -41,7 +41,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64 big_entries.resize(big_page_table_size / 32, 0); big_page_table_cpu.resize(big_page_table_size); big_page_continous.resize(big_page_table_size / continous_bits, 0); + std::array<PTEKind, 32> kind_valus; + kind_valus.fill(PTEKind::INVALID); + big_kinds.resize(big_page_table_size / 32, kind_valus); entries.resize(page_table_size / 32, 0); + kinds.resize(big_page_table_size / 32, kind_valus); } MemoryManager::~MemoryManager() = default; @@ -78,6 +82,41 @@ void MemoryManager::SetEntry(size_t position, MemoryManager::EntryType entry) { } } +PTEKind MemoryManager::GetPageKind(GPUVAddr gpu_addr) const { + auto entry = GetEntry<true>(gpu_addr); + if (entry == EntryType::Mapped || entry == EntryType::Reserved) [[likely]] { + return GetKind<true>(gpu_addr); + } else { + return GetKind<false>(gpu_addr); + } +} + +template <bool is_big_page> +PTEKind MemoryManager::GetKind(size_t position) const { + if constexpr (is_big_page) { + position = position >> big_page_bits; + const size_t sub_index = position % 32; + return big_kinds[position / 32][sub_index]; + } else { + position = position >> page_bits; + const size_t sub_index = position % 32; + return kinds[position / 32][sub_index]; + } +} + +template <bool is_big_page> +void MemoryManager::SetKind(size_t position, PTEKind kind) { + if constexpr (is_big_page) { + position = position >> big_page_bits; + const size_t sub_index = position % 32; + big_kinds[position / 32][sub_index] = kind; + } else { + position = position >> page_bits; + const size_t sub_index = position % 32; + kinds[position / 32][sub_index] = kind; + } +} + inline bool MemoryManager::IsBigPageContinous(size_t big_page_index) const { const u64 entry_mask = big_page_continous[big_page_index / continous_bits]; const size_t sub_index = big_page_index % continous_bits; @@ -92,8 +131,8 @@ inline void MemoryManager::SetBigPageContinous(size_t big_page_index, bool value } template <MemoryManager::EntryType entry_type> -GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, - size_t size) { +GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, size_t size, + PTEKind kind) { u64 remaining_size{size}; if constexpr (entry_type == EntryType::Mapped) { page_table.ReserveRange(gpu_addr, size); @@ -102,6 +141,7 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp const GPUVAddr current_gpu_addr = gpu_addr + offset; [[maybe_unused]] const auto current_entry_type = GetEntry<false>(current_gpu_addr); SetEntry<false>(current_gpu_addr, entry_type); + SetKind<false>(current_gpu_addr, kind); if (current_entry_type != entry_type) { rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, page_size); } @@ -118,12 +158,13 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp template <MemoryManager::EntryType entry_type> GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, - size_t size) { + size_t size, PTEKind kind) { u64 remaining_size{size}; for (u64 offset{}; offset < size; offset += big_page_size) { const GPUVAddr current_gpu_addr = gpu_addr + offset; [[maybe_unused]] const auto current_entry_type = GetEntry<true>(current_gpu_addr); SetEntry<true>(current_gpu_addr, entry_type); + SetKind<true>(current_gpu_addr, kind); if (current_entry_type != entry_type) { rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, big_page_size); } @@ -159,19 +200,19 @@ void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) rasterizer = rasterizer_; } -GPUVAddr MemoryManager::Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, +GPUVAddr MemoryManager::Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind, bool is_big_pages) { if (is_big_pages) [[likely]] { - return BigPageTableOp<EntryType::Mapped>(gpu_addr, cpu_addr, size); + return BigPageTableOp<EntryType::Mapped>(gpu_addr, cpu_addr, size, kind); } - return PageTableOp<EntryType::Mapped>(gpu_addr, cpu_addr, size); + return PageTableOp<EntryType::Mapped>(gpu_addr, cpu_addr, size, kind); } GPUVAddr MemoryManager::MapSparse(GPUVAddr gpu_addr, std::size_t size, bool is_big_pages) { if (is_big_pages) [[likely]] { - return BigPageTableOp<EntryType::Reserved>(gpu_addr, 0, size); + return BigPageTableOp<EntryType::Reserved>(gpu_addr, 0, size, PTEKind::INVALID); } - return PageTableOp<EntryType::Reserved>(gpu_addr, 0, size); + return PageTableOp<EntryType::Reserved>(gpu_addr, 0, size, PTEKind::INVALID); } void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { @@ -188,8 +229,8 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { rasterizer->UnmapMemory(*cpu_addr, map_size); } - BigPageTableOp<EntryType::Free>(gpu_addr, 0, size); - PageTableOp<EntryType::Free>(gpu_addr, 0, size); + BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID); + PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID); } std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index f992e29f3..ab4bc9ec6 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -11,6 +11,7 @@ #include "common/common_types.h" #include "common/multi_level_page_table.h" #include "common/virtual_buffer.h" +#include "video_core/pte_kind.h" namespace VideoCore { class RasterizerInterface; @@ -98,7 +99,8 @@ public: std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const; - GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, bool is_big_pages = true); + GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, + PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); GPUVAddr MapSparse(GPUVAddr gpu_addr, std::size_t size, bool is_big_pages = true); void Unmap(GPUVAddr gpu_addr, std::size_t size); @@ -114,6 +116,8 @@ public: return gpu_addr < address_space_size; } + PTEKind GetPageKind(GPUVAddr gpu_addr) const; + private: template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped> inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, @@ -166,10 +170,12 @@ private: std::vector<u64> big_entries; template <EntryType entry_type> - GPUVAddr PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, size_t size); + GPUVAddr PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, size_t size, + PTEKind kind); template <EntryType entry_type> - GPUVAddr BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, size_t size); + GPUVAddr BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cpu_addr, size_t size, + PTEKind kind); template <bool is_big_page> inline EntryType GetEntry(size_t position) const; @@ -177,6 +183,15 @@ private: template <bool is_big_page> inline void SetEntry(size_t position, EntryType entry); + std::vector<std::array<PTEKind, 32>> kinds; + std::vector<std::array<PTEKind, 32>> big_kinds; + + template <bool is_big_page> + inline PTEKind GetKind(size_t position) const; + + template <bool is_big_page> + inline void SetKind(size_t position, PTEKind kind); + Common::MultiLevelPageTable<u32> page_table; Common::VirtualBuffer<u32> big_page_table_cpu; diff --git a/src/video_core/pte_kind.h b/src/video_core/pte_kind.h new file mode 100644 index 000000000..591d7214b --- /dev/null +++ b/src/video_core/pte_kind.h @@ -0,0 +1,264 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/common_types.h" + +namespace Tegra { + +// https://github.com/NVIDIA/open-gpu-doc/blob/master/manuals/volta/gv100/dev_mmu.ref.txt +enum class PTEKind : u8 { + INVALID = 0xff, + PITCH = 0x00, + Z16 = 0x01, + Z16_2C = 0x02, + Z16_MS2_2C = 0x03, + Z16_MS4_2C = 0x04, + Z16_MS8_2C = 0x05, + Z16_MS16_2C = 0x06, + Z16_2Z = 0x07, + Z16_MS2_2Z = 0x08, + Z16_MS4_2Z = 0x09, + Z16_MS8_2Z = 0x0a, + Z16_MS16_2Z = 0x0b, + Z16_2CZ = 0x36, + Z16_MS2_2CZ = 0x37, + Z16_MS4_2CZ = 0x38, + Z16_MS8_2CZ = 0x39, + Z16_MS16_2CZ = 0x5f, + Z16_4CZ = 0x0c, + Z16_MS2_4CZ = 0x0d, + Z16_MS4_4CZ = 0x0e, + Z16_MS8_4CZ = 0x0f, + Z16_MS16_4CZ = 0x10, + S8Z24 = 0x11, + S8Z24_1Z = 0x12, + S8Z24_MS2_1Z = 0x13, + S8Z24_MS4_1Z = 0x14, + S8Z24_MS8_1Z = 0x15, + S8Z24_MS16_1Z = 0x16, + S8Z24_2CZ = 0x17, + S8Z24_MS2_2CZ = 0x18, + S8Z24_MS4_2CZ = 0x19, + S8Z24_MS8_2CZ = 0x1a, + S8Z24_MS16_2CZ = 0x1b, + S8Z24_2CS = 0x1c, + S8Z24_MS2_2CS = 0x1d, + S8Z24_MS4_2CS = 0x1e, + S8Z24_MS8_2CS = 0x1f, + S8Z24_MS16_2CS = 0x20, + S8Z24_4CSZV = 0x21, + S8Z24_MS2_4CSZV = 0x22, + S8Z24_MS4_4CSZV = 0x23, + S8Z24_MS8_4CSZV = 0x24, + S8Z24_MS16_4CSZV = 0x25, + V8Z24_MS4_VC12 = 0x26, + V8Z24_MS4_VC4 = 0x27, + V8Z24_MS8_VC8 = 0x28, + V8Z24_MS8_VC24 = 0x29, + V8Z24_MS4_VC12_1ZV = 0x2e, + V8Z24_MS4_VC4_1ZV = 0x2f, + V8Z24_MS8_VC8_1ZV = 0x30, + V8Z24_MS8_VC24_1ZV = 0x31, + V8Z24_MS4_VC12_2CS = 0x32, + V8Z24_MS4_VC4_2CS = 0x33, + V8Z24_MS8_VC8_2CS = 0x34, + V8Z24_MS8_VC24_2CS = 0x35, + V8Z24_MS4_VC12_2CZV = 0x3a, + V8Z24_MS4_VC4_2CZV = 0x3b, + V8Z24_MS8_VC8_2CZV = 0x3c, + V8Z24_MS8_VC24_2CZV = 0x3d, + V8Z24_MS4_VC12_2ZV = 0x3e, + V8Z24_MS4_VC4_2ZV = 0x3f, + V8Z24_MS8_VC8_2ZV = 0x40, + V8Z24_MS8_VC24_2ZV = 0x41, + V8Z24_MS4_VC12_4CSZV = 0x42, + V8Z24_MS4_VC4_4CSZV = 0x43, + V8Z24_MS8_VC8_4CSZV = 0x44, + V8Z24_MS8_VC24_4CSZV = 0x45, + Z24S8 = 0x46, + Z24S8_1Z = 0x47, + Z24S8_MS2_1Z = 0x48, + Z24S8_MS4_1Z = 0x49, + Z24S8_MS8_1Z = 0x4a, + Z24S8_MS16_1Z = 0x4b, + Z24S8_2CS = 0x4c, + Z24S8_MS2_2CS = 0x4d, + Z24S8_MS4_2CS = 0x4e, + Z24S8_MS8_2CS = 0x4f, + Z24S8_MS16_2CS = 0x50, + Z24S8_2CZ = 0x51, + Z24S8_MS2_2CZ = 0x52, + Z24S8_MS4_2CZ = 0x53, + Z24S8_MS8_2CZ = 0x54, + Z24S8_MS16_2CZ = 0x55, + Z24S8_4CSZV = 0x56, + Z24S8_MS2_4CSZV = 0x57, + Z24S8_MS4_4CSZV = 0x58, + Z24S8_MS8_4CSZV = 0x59, + Z24S8_MS16_4CSZV = 0x5a, + Z24V8_MS4_VC12 = 0x5b, + Z24V8_MS4_VC4 = 0x5c, + Z24V8_MS8_VC8 = 0x5d, + Z24V8_MS8_VC24 = 0x5e, + YUV_B8C1_2Y = 0x60, + YUV_B8C2_2Y = 0x61, + YUV_B10C1_2Y = 0x62, + YUV_B10C2_2Y = 0x6b, + YUV_B12C1_2Y = 0x6c, + YUV_B12C2_2Y = 0x6d, + Z24V8_MS4_VC12_1ZV = 0x63, + Z24V8_MS4_VC4_1ZV = 0x64, + Z24V8_MS8_VC8_1ZV = 0x65, + Z24V8_MS8_VC24_1ZV = 0x66, + Z24V8_MS4_VC12_2CS = 0x67, + Z24V8_MS4_VC4_2CS = 0x68, + Z24V8_MS8_VC8_2CS = 0x69, + Z24V8_MS8_VC24_2CS = 0x6a, + Z24V8_MS4_VC12_2CZV = 0x6f, + Z24V8_MS4_VC4_2CZV = 0x70, + Z24V8_MS8_VC8_2CZV = 0x71, + Z24V8_MS8_VC24_2CZV = 0x72, + Z24V8_MS4_VC12_2ZV = 0x73, + Z24V8_MS4_VC4_2ZV = 0x74, + Z24V8_MS8_VC8_2ZV = 0x75, + Z24V8_MS8_VC24_2ZV = 0x76, + Z24V8_MS4_VC12_4CSZV = 0x77, + Z24V8_MS4_VC4_4CSZV = 0x78, + Z24V8_MS8_VC8_4CSZV = 0x79, + Z24V8_MS8_VC24_4CSZV = 0x7a, + ZF32 = 0x7b, + ZF32_1Z = 0x7c, + ZF32_MS2_1Z = 0x7d, + ZF32_MS4_1Z = 0x7e, + ZF32_MS8_1Z = 0x7f, + ZF32_MS16_1Z = 0x80, + ZF32_2CS = 0x81, + ZF32_MS2_2CS = 0x82, + ZF32_MS4_2CS = 0x83, + ZF32_MS8_2CS = 0x84, + ZF32_MS16_2CS = 0x85, + ZF32_2CZ = 0x86, + ZF32_MS2_2CZ = 0x87, + ZF32_MS4_2CZ = 0x88, + ZF32_MS8_2CZ = 0x89, + ZF32_MS16_2CZ = 0x8a, + X8Z24_X16V8S8_MS4_VC12 = 0x8b, + X8Z24_X16V8S8_MS4_VC4 = 0x8c, + X8Z24_X16V8S8_MS8_VC8 = 0x8d, + X8Z24_X16V8S8_MS8_VC24 = 0x8e, + X8Z24_X16V8S8_MS4_VC12_1CS = 0x8f, + X8Z24_X16V8S8_MS4_VC4_1CS = 0x90, + X8Z24_X16V8S8_MS8_VC8_1CS = 0x91, + X8Z24_X16V8S8_MS8_VC24_1CS = 0x92, + X8Z24_X16V8S8_MS4_VC12_1ZV = 0x97, + X8Z24_X16V8S8_MS4_VC4_1ZV = 0x98, + X8Z24_X16V8S8_MS8_VC8_1ZV = 0x99, + X8Z24_X16V8S8_MS8_VC24_1ZV = 0x9a, + X8Z24_X16V8S8_MS4_VC12_1CZV = 0x9b, + X8Z24_X16V8S8_MS4_VC4_1CZV = 0x9c, + X8Z24_X16V8S8_MS8_VC8_1CZV = 0x9d, + X8Z24_X16V8S8_MS8_VC24_1CZV = 0x9e, + X8Z24_X16V8S8_MS4_VC12_2CS = 0x9f, + X8Z24_X16V8S8_MS4_VC4_2CS = 0xa0, + X8Z24_X16V8S8_MS8_VC8_2CS = 0xa1, + X8Z24_X16V8S8_MS8_VC24_2CS = 0xa2, + X8Z24_X16V8S8_MS4_VC12_2CSZV = 0xa3, + X8Z24_X16V8S8_MS4_VC4_2CSZV = 0xa4, + X8Z24_X16V8S8_MS8_VC8_2CSZV = 0xa5, + X8Z24_X16V8S8_MS8_VC24_2CSZV = 0xa6, + ZF32_X16V8S8_MS4_VC12 = 0xa7, + ZF32_X16V8S8_MS4_VC4 = 0xa8, + ZF32_X16V8S8_MS8_VC8 = 0xa9, + ZF32_X16V8S8_MS8_VC24 = 0xaa, + ZF32_X16V8S8_MS4_VC12_1CS = 0xab, + ZF32_X16V8S8_MS4_VC4_1CS = 0xac, + ZF32_X16V8S8_MS8_VC8_1CS = 0xad, + ZF32_X16V8S8_MS8_VC24_1CS = 0xae, + ZF32_X16V8S8_MS4_VC12_1ZV = 0xb3, + ZF32_X16V8S8_MS4_VC4_1ZV = 0xb4, + ZF32_X16V8S8_MS8_VC8_1ZV = 0xb5, + ZF32_X16V8S8_MS8_VC24_1ZV = 0xb6, + ZF32_X16V8S8_MS4_VC12_1CZV = 0xb7, + ZF32_X16V8S8_MS4_VC4_1CZV = 0xb8, + ZF32_X16V8S8_MS8_VC8_1CZV = 0xb9, + ZF32_X16V8S8_MS8_VC24_1CZV = 0xba, + ZF32_X16V8S8_MS4_VC12_2CS = 0xbb, + ZF32_X16V8S8_MS4_VC4_2CS = 0xbc, + ZF32_X16V8S8_MS8_VC8_2CS = 0xbd, + ZF32_X16V8S8_MS8_VC24_2CS = 0xbe, + ZF32_X16V8S8_MS4_VC12_2CSZV = 0xbf, + ZF32_X16V8S8_MS4_VC4_2CSZV = 0xc0, + ZF32_X16V8S8_MS8_VC8_2CSZV = 0xc1, + ZF32_X16V8S8_MS8_VC24_2CSZV = 0xc2, + ZF32_X24S8 = 0xc3, + ZF32_X24S8_1CS = 0xc4, + ZF32_X24S8_MS2_1CS = 0xc5, + ZF32_X24S8_MS4_1CS = 0xc6, + ZF32_X24S8_MS8_1CS = 0xc7, + ZF32_X24S8_MS16_1CS = 0xc8, + ZF32_X24S8_2CSZV = 0xce, + ZF32_X24S8_MS2_2CSZV = 0xcf, + ZF32_X24S8_MS4_2CSZV = 0xd0, + ZF32_X24S8_MS8_2CSZV = 0xd1, + ZF32_X24S8_MS16_2CSZV = 0xd2, + ZF32_X24S8_2CS = 0xd3, + ZF32_X24S8_MS2_2CS = 0xd4, + ZF32_X24S8_MS4_2CS = 0xd5, + ZF32_X24S8_MS8_2CS = 0xd6, + ZF32_X24S8_MS16_2CS = 0xd7, + S8 = 0x2a, + S8_2S = 0x2b, + GENERIC_16BX2 = 0xfe, + C32_2C = 0xd8, + C32_2CBR = 0xd9, + C32_2CBA = 0xda, + C32_2CRA = 0xdb, + C32_2BRA = 0xdc, + C32_MS2_2C = 0xdd, + C32_MS2_2CBR = 0xde, + C32_MS2_4CBRA = 0xcc, + C32_MS4_2C = 0xdf, + C32_MS4_2CBR = 0xe0, + C32_MS4_2CBA = 0xe1, + C32_MS4_2CRA = 0xe2, + C32_MS4_2BRA = 0xe3, + C32_MS4_4CBRA = 0x2c, + C32_MS8_MS16_2C = 0xe4, + C32_MS8_MS16_2CRA = 0xe5, + C64_2C = 0xe6, + C64_2CBR = 0xe7, + C64_2CBA = 0xe8, + C64_2CRA = 0xe9, + C64_2BRA = 0xea, + C64_MS2_2C = 0xeb, + C64_MS2_2CBR = 0xec, + C64_MS2_4CBRA = 0xcd, + C64_MS4_2C = 0xed, + C64_MS4_2CBR = 0xee, + C64_MS4_2CBA = 0xef, + C64_MS4_2CRA = 0xf0, + C64_MS4_2BRA = 0xf1, + C64_MS4_4CBRA = 0x2d, + C64_MS8_MS16_2C = 0xf2, + C64_MS8_MS16_2CRA = 0xf3, + C128_2C = 0xf4, + C128_2CR = 0xf5, + C128_MS2_2C = 0xf6, + C128_MS2_2CR = 0xf7, + C128_MS4_2C = 0xf8, + C128_MS4_2CR = 0xf9, + C128_MS8_MS16_2C = 0xfa, + C128_MS8_MS16_2CR = 0xfb, + X8C24 = 0xfc, + PITCH_NO_SWIZZLE = 0xfd, + SMSKED_MESSAGE = 0xca, + SMHOST_MESSAGE = 0xcb, +}; + +constexpr bool IsPitchKind(PTEKind kind) { + return kind == PTEKind::PITCH || kind == PTEKind::PITCH_NO_SWIZZLE; +} + +} // namespace Tegra diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp index 45791aa75..e8761a747 100644 --- a/src/video_core/renderer_base.cpp +++ b/src/video_core/renderer_base.cpp @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: 2015 Citra Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include <thread> + #include "common/logging/log.h" #include "core/frontend/emu_window.h" #include "video_core/renderer_base.h" @@ -35,8 +37,12 @@ void RendererBase::RequestScreenshot(void* data, std::function<void(bool)> callb LOG_ERROR(Render, "A screenshot is already requested or in progress, ignoring the request"); return; } + auto async_callback{[callback = std::move(callback)](bool invert_y) { + std::thread t{callback, invert_y}; + t.detach(); + }}; renderer_settings.screenshot_bits = data; - renderer_settings.screenshot_complete_callback = std::move(callback); + renderer_settings.screenshot_complete_callback = async_callback; renderer_settings.screenshot_framebuffer_layout = layout; renderer_settings.screenshot_requested = true; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index cce00cea8..e5c09a969 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -658,8 +658,13 @@ void RasterizerOpenGL::SyncDepthClamp() { } flags[Dirty::DepthClampEnabled] = false; - oglEnable(GL_DEPTH_CLAMP, maxwell3d->regs.viewport_clip_control.geometry_clip != - Maxwell::ViewportClipControl::GeometryClip::Passthrough); + bool depth_clamp_disabled{maxwell3d->regs.viewport_clip_control.geometry_clip == + Maxwell::ViewportClipControl::GeometryClip::Passthrough || + maxwell3d->regs.viewport_clip_control.geometry_clip == + Maxwell::ViewportClipControl::GeometryClip::FrustumXYZ || + maxwell3d->regs.viewport_clip_control.geometry_clip == + Maxwell::ViewportClipControl::GeometryClip::FrustumZ}; + oglEnable(GL_DEPTH_CLAMP, !depth_clamp_disabled); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { @@ -746,19 +751,19 @@ void RasterizerOpenGL::SyncStencilTestState() { oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_op.func), - regs.stencil_front_func.ref, regs.stencil_front_func.func_mask); + regs.stencil_front_ref, regs.stencil_front_func_mask); glStencilOpSeparate(GL_FRONT, MaxwellToGL::StencilOp(regs.stencil_front_op.fail), MaxwellToGL::StencilOp(regs.stencil_front_op.zfail), MaxwellToGL::StencilOp(regs.stencil_front_op.zpass)); - glStencilMaskSeparate(GL_FRONT, regs.stencil_front_func.mask); + glStencilMaskSeparate(GL_FRONT, regs.stencil_front_mask); if (regs.stencil_two_side_enable) { glStencilFuncSeparate(GL_BACK, MaxwellToGL::ComparisonOp(regs.stencil_back_op.func), - regs.stencil_back_func.ref, regs.stencil_back_func.mask); + regs.stencil_back_ref, regs.stencil_back_mask); glStencilOpSeparate(GL_BACK, MaxwellToGL::StencilOp(regs.stencil_back_op.fail), MaxwellToGL::StencilOp(regs.stencil_back_op.zfail), MaxwellToGL::StencilOp(regs.stencil_back_op.zpass)); - glStencilMaskSeparate(GL_BACK, regs.stencil_back_func.mask); + glStencilMaskSeparate(GL_BACK, regs.stencil_back_mask); } else { glStencilFuncSeparate(GL_BACK, GL_ALWAYS, 0, 0xFFFFFFFF); glStencilOpSeparate(GL_BACK, GL_KEEP, GL_KEEP, GL_KEEP); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 6bdb0b645..609f0a772 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -317,8 +317,8 @@ GraphicsPipeline* ShaderCache::CurrentGraphicsPipeline() { graphics_key.tessellation_primitive.Assign(regs.tessellation.params.domain_type.Value()); graphics_key.tessellation_spacing.Assign(regs.tessellation.params.spacing.Value()); graphics_key.tessellation_clockwise.Assign( - regs.tessellation.params.output_primitives.Value() != - Maxwell::Tessellation::OutputPrimitves::Triangles_CCW); + regs.tessellation.params.output_primitives.Value() == + Maxwell::Tessellation::OutputPrimitives::Triangles_CW); graphics_key.xfb_enabled.Assign(regs.transform_feedback_enabled != 0 ? 1 : 0); if (graphics_key.xfb_enabled) { SetXfbState(graphics_key.xfb_state, regs); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index e2c709aac..a359f96f1 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -100,14 +100,12 @@ void SetupDirtyDepthTest(Tables& tables) { void SetupDirtyStencilTest(Tables& tables) { static constexpr std::array offsets = { - OFF(stencil_enable), OFF(stencil_front_op.func), - OFF(stencil_front_func.ref), OFF(stencil_front_func.func_mask), - OFF(stencil_front_op.fail), OFF(stencil_front_op.zfail), - OFF(stencil_front_op.zpass), OFF(stencil_front_func.mask), - OFF(stencil_two_side_enable), OFF(stencil_back_op.func), - OFF(stencil_back_func.ref), OFF(stencil_back_func.func_mask), - OFF(stencil_back_op.fail), OFF(stencil_back_op.zfail), - OFF(stencil_back_op.zpass), OFF(stencil_back_func.mask)}; + OFF(stencil_enable), OFF(stencil_front_op.func), OFF(stencil_front_ref), + OFF(stencil_front_func_mask), OFF(stencil_front_op.fail), OFF(stencil_front_op.zfail), + OFF(stencil_front_op.zpass), OFF(stencil_front_mask), OFF(stencil_two_side_enable), + OFF(stencil_back_op.func), OFF(stencil_back_ref), OFF(stencil_back_func_mask), + OFF(stencil_back_op.fail), OFF(stencil_back_op.zfail), OFF(stencil_back_op.zpass), + OFF(stencil_back_mask)}; for (const auto offset : offsets) { tables[0][offset] = StencilTest; } diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index eb7c22fd5..f85ed8e5b 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -63,14 +63,18 @@ void FixedPipelineState::Refresh(Tegra::Engines::Maxwell3D& maxwell3d, primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0); depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0); depth_clamp_disabled.Assign(regs.viewport_clip_control.geometry_clip == - Maxwell::ViewportClipControl::GeometryClip::Passthrough); + Maxwell::ViewportClipControl::GeometryClip::Passthrough || + regs.viewport_clip_control.geometry_clip == + Maxwell::ViewportClipControl::GeometryClip::FrustumXYZ || + regs.viewport_clip_control.geometry_clip == + Maxwell::ViewportClipControl::GeometryClip::FrustumZ); ndc_minus_one_to_one.Assign(regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1 : 0); polygon_mode.Assign(PackPolygonMode(regs.polygon_mode_front)); patch_control_points_minus_one.Assign(regs.patch_vertices - 1); tessellation_primitive.Assign(static_cast<u32>(regs.tessellation.params.domain_type.Value())); tessellation_spacing.Assign(static_cast<u32>(regs.tessellation.params.spacing.Value())); - tessellation_clockwise.Assign(regs.tessellation.params.output_primitives.Value() != - Maxwell::Tessellation::OutputPrimitves::Triangles_CCW); + tessellation_clockwise.Assign(regs.tessellation.params.output_primitives.Value() == + Maxwell::Tessellation::OutputPrimitives::Triangles_CW); logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); logic_op.Assign(PackLogicOp(regs.logic_op.op)); topology.Assign(regs.draw.topology); diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 7cb02631c..4b15c0f85 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -59,10 +59,11 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { return query_pool == *pool; }); - ASSERT(it != std::end(pools)); - const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); - usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; + if (it != std::end(pools)) { + const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); + usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; + } } QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, const Device& device_, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 892cd94a3..47dfb45a1 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -772,11 +772,10 @@ void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) if (regs.stencil_two_side_enable) { // Separate values per face scheduler.Record( - [front_ref = regs.stencil_front_func.ref, - front_write_mask = regs.stencil_front_func.mask, - front_test_mask = regs.stencil_front_func.func_mask, - back_ref = regs.stencil_back_func.ref, back_write_mask = regs.stencil_back_func.mask, - back_test_mask = regs.stencil_back_func.func_mask](vk::CommandBuffer cmdbuf) { + [front_ref = regs.stencil_front_ref, front_write_mask = regs.stencil_front_mask, + front_test_mask = regs.stencil_front_func_mask, back_ref = regs.stencil_back_ref, + back_write_mask = regs.stencil_back_mask, + back_test_mask = regs.stencil_back_func_mask](vk::CommandBuffer cmdbuf) { // Front face cmdbuf.SetStencilReference(VK_STENCIL_FACE_FRONT_BIT, front_ref); cmdbuf.SetStencilWriteMask(VK_STENCIL_FACE_FRONT_BIT, front_write_mask); @@ -789,9 +788,8 @@ void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) }); } else { // Front face defines both faces - scheduler.Record([ref = regs.stencil_front_func.ref, - write_mask = regs.stencil_front_func.mask, - test_mask = regs.stencil_front_func.func_mask](vk::CommandBuffer cmdbuf) { + scheduler.Record([ref = regs.stencil_front_ref, write_mask = regs.stencil_front_mask, + test_mask = regs.stencil_front_func_mask](vk::CommandBuffer cmdbuf) { cmdbuf.SetStencilReference(VK_STENCIL_FACE_FRONT_AND_BACK, ref); cmdbuf.SetStencilWriteMask(VK_STENCIL_FACE_FRONT_AND_BACK, write_mask); cmdbuf.SetStencilCompareMask(VK_STENCIL_FACE_FRONT_AND_BACK, test_mask); diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 7fb256953..06f68d09a 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -26,39 +26,20 @@ using namespace Common::Literals; constexpr VkDeviceSize MAX_ALIGNMENT = 256; // Maximum size to put elements in the stream buffer constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; +// Stream buffer size in bytes +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; +constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; constexpr VkMemoryPropertyFlags HOST_FLAGS = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS; -static bool IsStreamHeap(VkMemoryHeap heap, size_t staging_buffer_size) noexcept { - return staging_buffer_size < (heap.size * 2) / 3; -} - -static bool HasLargeDeviceLocalHostVisibleMemory(const VkPhysicalDeviceMemoryProperties& props) { - const auto flags{VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT}; - - for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { - const auto& memory_type{props.memoryTypes[type_index]}; - - if ((memory_type.propertyFlags & flags) != flags) { - // Memory must be device local and host visible - continue; - } - - const auto& heap{props.memoryHeaps[memory_type.heapIndex]}; - if (heap.size >= 7168_MiB) { - // This is the right type of memory - return true; - } - } - - return false; +bool IsStreamHeap(VkMemoryHeap heap) noexcept { + return STREAM_BUFFER_SIZE < (heap.size * 2) / 3; } std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - VkMemoryPropertyFlags flags, - size_t staging_buffer_size) noexcept { + VkMemoryPropertyFlags flags) noexcept { for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { if (((type_mask >> type_index) & 1) == 0) { // Memory type is incompatible @@ -69,7 +50,7 @@ std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& p // Memory type doesn't have the flags we want continue; } - if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex], staging_buffer_size)) { + if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) { // Memory heap is not suitable for streaming continue; } @@ -80,17 +61,17 @@ std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& p } u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - bool try_device_local, size_t staging_buffer_size) { + bool try_device_local) { std::optional<u32> type; if (try_device_local) { // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this - type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS, staging_buffer_size); + type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS); if (type) { return *type; } } // Otherwise try without the DEVICE_LOCAL_BIT - type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS, staging_buffer_size); + type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS); if (type) { return *type; } @@ -98,32 +79,20 @@ u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_ throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY); } -size_t Region(size_t iterator, size_t region_size) noexcept { - return iterator / region_size; +size_t Region(size_t iterator) noexcept { + return iterator / REGION_SIZE; } } // Anonymous namespace StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { - - const auto memory_properties{device.GetPhysical().GetMemoryProperties().memoryProperties}; - if (HasLargeDeviceLocalHostVisibleMemory(memory_properties)) { - // Possible on many integrated and newer discrete cards - staging_buffer_size = 1_GiB; - } else { - // Well-supported default size used by most Vulkan PC games - staging_buffer_size = 256_MiB; - } - - region_size = staging_buffer_size / StagingBufferPool::NUM_SYNCS; - const vk::Device& dev = device.GetLogical(); stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = staging_buffer_size, + .size = STREAM_BUFFER_SIZE, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, @@ -148,18 +117,19 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem .image = nullptr, .buffer = *stream_buffer, }; + const auto memory_properties = device.GetPhysical().GetMemoryProperties().memoryProperties; VkMemoryAllocateInfo stream_memory_info{ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .pNext = make_dedicated ? &dedicated_info : nullptr, .allocationSize = requirements.size, - .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true, - staging_buffer_size), + .memoryTypeIndex = + FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true), }; stream_memory = dev.TryAllocateMemory(stream_memory_info); if (!stream_memory) { LOG_INFO(Render_Vulkan, "Dynamic memory allocation failed, trying with system memory"); - stream_memory_info.memoryTypeIndex = FindMemoryTypeIndex( - memory_properties, requirements.memoryTypeBits, false, staging_buffer_size); + stream_memory_info.memoryTypeIndex = + FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, false); stream_memory = dev.AllocateMemory(stream_memory_info); } @@ -167,7 +137,7 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem stream_memory.SetObjectNameEXT("Stream Buffer Memory"); } stream_buffer.BindMemory(*stream_memory, 0); - stream_pointer = stream_memory.Map(0, staging_buffer_size); + stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE); } StagingBufferPool::~StagingBufferPool() = default; @@ -188,25 +158,25 @@ void StagingBufferPool::TickFrame() { } StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { - if (AreRegionsActive(Region(free_iterator, region_size) + 1, - std::min(Region(iterator + size, region_size) + 1, NUM_SYNCS))) { + if (AreRegionsActive(Region(free_iterator) + 1, + std::min(Region(iterator + size) + 1, NUM_SYNCS))) { // Avoid waiting for the previous usages to be free return GetStagingBuffer(size, MemoryUsage::Upload); } const u64 current_tick = scheduler.CurrentTick(); - std::fill(sync_ticks.begin() + Region(used_iterator, region_size), - sync_ticks.begin() + Region(iterator, region_size), current_tick); + std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator), + current_tick); used_iterator = iterator; free_iterator = std::max(free_iterator, iterator + size); - if (iterator + size >= staging_buffer_size) { - std::fill(sync_ticks.begin() + Region(used_iterator, region_size), - sync_ticks.begin() + NUM_SYNCS, current_tick); + if (iterator + size >= STREAM_BUFFER_SIZE) { + std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS, + current_tick); used_iterator = 0; iterator = 0; free_iterator = size; - if (AreRegionsActive(0, Region(size, region_size) + 1)) { + if (AreRegionsActive(0, Region(size) + 1)) { // Avoid waiting for the previous usages to be free return GetStagingBuffer(size, MemoryUsage::Upload); } diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 90c67177f..91dc84da8 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -93,9 +93,6 @@ private: size_t free_iterator = 0; std::array<u64, NUM_SYNCS> sync_ticks{}; - size_t staging_buffer_size = 0; - size_t region_size = 0; - StagingBuffersCache device_local_cache; StagingBuffersCache upload_cache; StagingBuffersCache download_cache; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index ed98c8370..b87c3be66 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -77,12 +77,12 @@ void SetupDirtyDepthBounds(Tables& tables) { void SetupDirtyStencilProperties(Tables& tables) { auto& table = tables[0]; table[OFF(stencil_two_side_enable)] = StencilProperties; - table[OFF(stencil_front_func.ref)] = StencilProperties; - table[OFF(stencil_front_func.mask)] = StencilProperties; - table[OFF(stencil_front_func.func_mask)] = StencilProperties; - table[OFF(stencil_back_func.ref)] = StencilProperties; - table[OFF(stencil_back_func.mask)] = StencilProperties; - table[OFF(stencil_back_func.func_mask)] = StencilProperties; + table[OFF(stencil_front_ref)] = StencilProperties; + table[OFF(stencil_front_mask)] = StencilProperties; + table[OFF(stencil_front_func_mask)] = StencilProperties; + table[OFF(stencil_back_ref)] = StencilProperties; + table[OFF(stencil_back_mask)] = StencilProperties; + table[OFF(stencil_back_func_mask)] = StencilProperties; } void SetupDirtyLineWidth(Tables& tables) { diff --git a/src/video_core/texture_cache/descriptor_table.h b/src/video_core/texture_cache/descriptor_table.h index b18e3838f..ee4240288 100644 --- a/src/video_core/texture_cache/descriptor_table.h +++ b/src/video_core/texture_cache/descriptor_table.h @@ -18,7 +18,7 @@ class DescriptorTable { public: explicit DescriptorTable(Tegra::MemoryManager& gpu_memory_) : gpu_memory{gpu_memory_} {} - [[nodiscard]] bool Synchornize(GPUVAddr gpu_addr, u32 limit) { + [[nodiscard]] bool Synchronize(GPUVAddr gpu_addr, u32 limit) { [[likely]] if (current_gpu_addr == gpu_addr && current_limit == limit) { return false; } diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index ad935d386..08aa8ca33 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -150,6 +150,8 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::D24_UNORM_S8_UINT; case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR): return PixelFormat::D32_FLOAT_S8_UINT; + case Hash(TextureFormat::R32_B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR): + return PixelFormat::D32_FLOAT_S8_UINT; case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR): return PixelFormat::BC1_RGBA_UNORM; case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB): diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 413baf730..0e0fd410f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -193,11 +193,11 @@ void TextureCache<P>::SynchronizeGraphicsDescriptors() { const bool linked_tsc = maxwell3d->regs.sampler_binding == SamplerBinding::ViaHeaderBinding; const u32 tic_limit = maxwell3d->regs.tex_header.limit; const u32 tsc_limit = linked_tsc ? tic_limit : maxwell3d->regs.tex_sampler.limit; - if (channel_state->graphics_sampler_table.Synchornize(maxwell3d->regs.tex_sampler.Address(), + if (channel_state->graphics_sampler_table.Synchronize(maxwell3d->regs.tex_sampler.Address(), tsc_limit)) { channel_state->graphics_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); } - if (channel_state->graphics_image_table.Synchornize(maxwell3d->regs.tex_header.Address(), + if (channel_state->graphics_image_table.Synchronize(maxwell3d->regs.tex_header.Address(), tic_limit)) { channel_state->graphics_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); } @@ -209,10 +209,10 @@ void TextureCache<P>::SynchronizeComputeDescriptors() { const u32 tic_limit = kepler_compute->regs.tic.limit; const u32 tsc_limit = linked_tsc ? tic_limit : kepler_compute->regs.tsc.limit; const GPUVAddr tsc_gpu_addr = kepler_compute->regs.tsc.Address(); - if (channel_state->compute_sampler_table.Synchornize(tsc_gpu_addr, tsc_limit)) { + if (channel_state->compute_sampler_table.Synchronize(tsc_gpu_addr, tsc_limit)) { channel_state->compute_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); } - if (channel_state->compute_image_table.Synchornize(kepler_compute->regs.tic.Address(), + if (channel_state->compute_image_table.Synchronize(kepler_compute->regs.tic.Address(), tic_limit)) { channel_state->compute_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); } |