diff options
Diffstat (limited to 'src/video_core')
25 files changed, 312 insertions, 128 deletions
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index fe9fc0278..125c53360 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -385,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryGet(); break; } + case MAXWELL3D_REG_INDEX(condition.mode): { + ProcessQueryCondition(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -438,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() { result = regs.query.query_sequence; break; default: + result = 1; UNIMPLEMENTED_MSG("Unimplemented query select type {}", static_cast<u32>(regs.query.query_get.select.Value())); } @@ -477,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() { } } +void Maxwell3D::ProcessQueryCondition() { + const GPUVAddr condition_address{regs.condition.Address()}; + switch (regs.condition.mode) { + case Regs::ConditionMode::Always: { + execute_on = true; + break; + } + case Regs::ConditionMode::Never: { + execute_on = false; + break; + } + case Regs::ConditionMode::ResNonZero: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; + break; + } + case Regs::ConditionMode::Equal: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; + break; + } + case Regs::ConditionMode::NotEqual: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; + break; + } + default: { + UNIMPLEMENTED_MSG("Uninplemented Condition Mode!"); + execute_on = true; + break; + } + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); const u32 cache_flush = regs.sync_info.unknown.Value(); - LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment, - cache_flush); + if (increment) { + system.GPU().IncrementSyncPoint(sync_point); + } } void Maxwell3D::DrawArrays() { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ac300bf76..1ee982b76 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -90,6 +90,20 @@ public: enum class QuerySelect : u32 { Zero = 0, + TimeElapsed = 2, + TransformFeedbackPrimitivesGenerated = 11, + PrimitivesGenerated = 18, + SamplesPassed = 21, + TransformFeedbackUnknown = 26, + }; + + struct QueryCompare { + u32 initial_sequence; + u32 initial_mode; + u32 unknown1; + u32 unknown2; + u32 current_sequence; + u32 current_mode; }; enum class QuerySyncCondition : u32 { @@ -97,6 +111,14 @@ public: GreaterThan = 1, }; + enum class ConditionMode : u32 { + Never = 0, + Always = 1, + ResNonZero = 2, + Equal = 3, + NotEqual = 4, + }; + enum class ShaderProgram : u32 { VertexA = 0, VertexB = 1, @@ -815,7 +837,18 @@ public: BitField<4, 1, u32> alpha_to_one; } multisample_control; - INSERT_PADDING_WORDS(0x7); + INSERT_PADDING_WORDS(0x4); + + struct { + u32 address_high; + u32 address_low; + ConditionMode mode; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } condition; struct { u32 tsc_address_high; @@ -1223,6 +1256,10 @@ public: return macro_memory; } + bool ShouldExecute() const { + return execute_on; + } + private: void InitializeRegisterDefaults(); @@ -1257,6 +1294,8 @@ private: Upload::State upload_state; + bool execute_on{true}; + /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1284,6 +1323,9 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); + // Handles Conditional Rendering + void ProcessQueryCondition(); + /// Handles writes to syncing register. void ProcessSyncPoint(); @@ -1357,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544); ASSERT_REG_POSITION(point_size, 0x546); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); +ASSERT_REG_POSITION(condition, 0x554); ASSERT_REG_POSITION(tsc, 0x557); ASSERT_REG_POSITION(polygon_offset_factor, 0x55b); ASSERT_REG_POSITION(tic, 0x55D); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index b5f57e534..a28c04473 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { } void MaxwellDMA::HandleCopy() { - LOG_WARNING(HW_GPU, "Requested a DMA copy"); + LOG_TRACE(HW_GPU, "Requested a DMA copy"); const GPUVAddr source = regs.src_address.Address(); const GPUVAddr dest = regs.dst_address.Address(); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 8520a0143..083ee3304 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -560,6 +560,11 @@ union Instruction { BitField<48, 16, u64> opcode; union { + BitField<8, 5, ConditionCode> cc; + BitField<13, 1, u64> trigger; + } nop; + + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; } gmem; @@ -1516,6 +1521,7 @@ public: TMML, // Texture Mip Map Level SUST, // Surface Store EXIT, + NOP, IPA, OUT_R, // Emit vertex/primitive ISBERD, @@ -1795,6 +1801,7 @@ private: INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"), INST("1101111101011---", Id::TMML, Type::Texture, "TMML"), INST("11101011001-----", Id::SUST, Type::Image, "SUST"), + INST("0101000010110---", Id::NOP, Type::Trivial, "NOP"), INST("11100000--------", Id::IPA, Type::Trivial, "IPA"), INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"), INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index e25754e37..1622332a4 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { UNREACHABLE(); } -GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} { +GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async) + : system{system}, renderer{renderer}, is_async{is_async} { auto& rasterizer{renderer.Rasterizer()}; memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); @@ -74,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +void GPU::IncrementSyncPoint(const u32 syncpoint_id) { + syncpoints[syncpoint_id]++; + std::lock_guard lock{sync_mutex}; + if (!syncpt_interrupts[syncpoint_id].empty()) { + u32 value = syncpoints[syncpoint_id].load(); + auto it = syncpt_interrupts[syncpoint_id].begin(); + while (it != syncpt_interrupts[syncpoint_id].end()) { + if (value >= *it) { + TriggerCpuInterrupt(syncpoint_id, *it); + it = syncpt_interrupts[syncpoint_id].erase(it); + continue; + } + it++; + } + } +} + +u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { + return syncpoints[syncpoint_id].load(); +} + +void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { + auto& interrupt = syncpt_interrupts[syncpoint_id]; + bool contains = std::any_of(interrupt.begin(), interrupt.end(), + [value](u32 in_value) { return in_value == value; }); + if (contains) { + return; + } + syncpt_interrupts[syncpoint_id].emplace_back(value); +} + +bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts[syncpoint_id]; + const auto iter = + std::find_if(interrupt.begin(), interrupt.end(), + [value](u32 interrupt_value) { return value == interrupt_value; }); + + if (iter == interrupt.end()) { + return false; + } + interrupt.erase(iter); + return true; +} + u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { ASSERT(format != RenderTargetFormat::NONE); @@ -151,12 +197,12 @@ enum class BufferMethods { NotifyIntr = 0x8, WrcacheFlush = 0x9, Unk28 = 0xA, - Unk2c = 0xB, + UnkCacheFlush = 0xB, RefCnt = 0x14, SemaphoreAcquire = 0x1A, SemaphoreRelease = 0x1B, - Unk70 = 0x1C, - Unk74 = 0x1D, + FenceValue = 0x1C, + FenceAction = 0x1D, Unk78 = 0x1E, Unk7c = 0x1F, Yield = 0x20, @@ -202,6 +248,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::SemaphoreAddressLow: case BufferMethods::SemaphoreSequence: case BufferMethods::RefCnt: + case BufferMethods::UnkCacheFlush: + case BufferMethods::WrcacheFlush: + case BufferMethods::FenceValue: + case BufferMethods::FenceAction: break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -212,21 +262,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); break; } - case BufferMethods::WrcacheFlush: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented"); - break; - } case BufferMethods::Unk28: { // TODO(Kmather73): Research and implement this method. LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); break; } - case BufferMethods::Unk2c: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented"); - break; - } case BufferMethods::SemaphoreAcquire: { ProcessSemaphoreAcquire(); break; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 0ace0ff4f..87c96f46b 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -5,8 +5,12 @@ #pragma once #include <array> +#include <atomic> +#include <list> #include <memory> +#include <mutex> #include "common/common_types.h" +#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/dma_pusher.h" @@ -127,7 +131,7 @@ class MemoryManager; class GPU { public: - explicit GPU(Core::System& system, VideoCore::RendererBase& renderer); + explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async); virtual ~GPU(); @@ -170,6 +174,22 @@ public: /// Returns a reference to the GPU DMA pusher. Tegra::DmaPusher& DmaPusher(); + void IncrementSyncPoint(u32 syncpoint_id); + + u32 GetSyncpointValue(u32 syncpoint_id) const; + + void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); + + bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + + std::unique_lock<std::mutex> LockSync() { + return std::unique_lock{sync_mutex}; + } + + bool IsAsync() const { + return is_async; + } + /// Returns a const reference to the GPU DMA pusher. const Tegra::DmaPusher& DmaPusher() const; @@ -200,7 +220,12 @@ public: u32 semaphore_acquire; u32 semaphore_release; - INSERT_PADDING_WORDS(0xE4); + u32 fence_value; + union { + BitField<4, 4, u32> operation; + BitField<8, 8, u32> id; + } fence_action; + INSERT_PADDING_WORDS(0xE2); // Puller state u32 acquire_mode; @@ -234,6 +259,9 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; +protected: + virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0; + private: void ProcessBindMethod(const MethodCall& method_call); void ProcessSemaphoreTriggerMethod(); @@ -252,6 +280,7 @@ private: protected: std::unique_ptr<Tegra::DmaPusher> dma_pusher; VideoCore::RendererBase& renderer; + Core::System& system; private: std::unique_ptr<Tegra::MemoryManager> memory_manager; @@ -268,6 +297,14 @@ private: std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; /// Inline memory engine std::unique_ptr<Engines::KeplerMemory> kepler_memory; + + std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; + + std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + + std::mutex sync_mutex; + + const bool is_async; }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -280,6 +317,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7); ASSERT_REG_POSITION(reference_count, 0x14); ASSERT_REG_POSITION(semaphore_acquire, 0x1A); ASSERT_REG_POSITION(semaphore_release, 0x1B); +ASSERT_REG_POSITION(fence_value, 0x1C); +ASSERT_REG_POSITION(fence_action, 0x1D); ASSERT_REG_POSITION(acquire_mode, 0x100); ASSERT_REG_POSITION(acquire_source, 0x101); diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index d4e2553a9..ea67be831 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "core/core.h" +#include "core/hardware_interrupt_manager.h" #include "video_core/gpu_asynch.h" #include "video_core/gpu_thread.h" #include "video_core/renderer_base.h" @@ -9,7 +11,7 @@ namespace VideoCommon { GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) - : GPU(system, renderer), gpu_thread{system} {} + : GPU(system, renderer, true), gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; @@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { gpu_thread.FlushAndInvalidateRegion(addr, size); } +void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +} + } // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 30be74cba..36377d677 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -27,6 +27,9 @@ public: void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; +protected: + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; + private: GPUThread::ThreadManager gpu_thread; }; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 45e43b1dc..d4ead9c47 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -8,7 +8,7 @@ namespace VideoCommon { GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer) - : GPU(system, renderer) {} + : GPU(system, renderer, false) {} GPUSynch::~GPUSynch() = default; diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 3031fcf72..07bcc47f1 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -25,6 +25,10 @@ public: void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + +protected: + void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, + [[maybe_unused]] u32 value) const override {} }; } // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 3f0939ec9..b441e92b0 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p MicroProfileOnThreadCreate("GpuThread"); // Wait for first GPU command before acquiring the window context - state.WaitForCommands(); + while (state.queue.Empty()) + ; // If emulation was stopped during disk shader loading, abort before trying to acquire context if (!state.is_running) { @@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p CommandDataContainer next; while (state.is_running) { - state.WaitForCommands(); while (!state.queue.Empty()) { state.queue.Pop(next); if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { @@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p } else { UNREACHABLE(); } - state.signaled_fence = next.fence; - state.TrySynchronize(); + state.signaled_fence.store(next.fence); } } } @@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) { } void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) { - if (state.queue.Empty()) { - // It's quicker to invalidate a single region on the CPU if the queue is already empty - system.Renderer().Rasterizer().InvalidateRegion(addr, size); - } else { - PushCommand(InvalidateRegionCommand(addr, size)); - } + system.Renderer().Rasterizer().InvalidateRegion(addr, size); } void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); - state.SignalCommands(); return fence; } MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); void SynchState::WaitForSynchronization(u64 fence) { - if (signaled_fence >= fence) { - return; - } - - // Wait for the GPU to be idle (all commands to be executed) - { - MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock lock{synchronization_mutex}; - synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; }); - } + while (signaled_fence.load() < fence) + ; } } // namespace VideoCommon::GPUThread diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 05a168a72..1d9d0c39e 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -88,41 +88,9 @@ struct CommandDataContainer { /// Struct used to synchronize the GPU thread struct SynchState final { std::atomic_bool is_running{true}; - std::atomic_int queued_frame_count{}; - std::mutex synchronization_mutex; - std::mutex commands_mutex; - std::condition_variable commands_condition; - std::condition_variable synchronization_condition; - - /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU - /// synchronized. This is entirely empirical. - bool IsSynchronized() const { - constexpr std::size_t max_queue_gap{5}; - return queue.Size() <= max_queue_gap; - } - - void TrySynchronize() { - if (IsSynchronized()) { - std::lock_guard lock{synchronization_mutex}; - synchronization_condition.notify_one(); - } - } void WaitForSynchronization(u64 fence); - void SignalCommands() { - if (queue.Empty()) { - return; - } - - commands_condition.notify_one(); - } - - void WaitForCommands() { - std::unique_lock lock{commands_mutex}; - commands_condition.wait(lock, [this] { return !queue.Empty(); }); - } - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; CommandQueue queue; u64 last_fence{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c59e687b6..c28ae795c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -595,7 +595,13 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, boo } void RasterizerOpenGL::Clear() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& maxwell3d = system.GPU().Maxwell3D(); + + if (!maxwell3d.ShouldExecute()) { + return; + } + + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -697,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.ShouldExecute()) { + return; + } + const auto& regs = gpu.regs; SyncColorMask(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 8fcd39a69..408332f90 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]}; - ASSERT(component_type == format.component_type); return format; } diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index fdcc970ff..ec3a76690 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -15,7 +15,7 @@ #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { - +namespace { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; @@ -29,8 +29,7 @@ struct Query { struct BlockStack { BlockStack() = default; - BlockStack(const BlockStack& b) = default; - BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {} + explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {} std::stack<u32> ssy_stack{}; std::stack<u32> pbk_stack{}; }; @@ -58,7 +57,7 @@ struct BlockInfo { struct CFGRebuildState { explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size, const u32 start) - : program_code{program_code}, program_size{program_size}, start{start} {} + : start{start}, program_code{program_code}, program_size{program_size} {} u32 start{}; std::vector<BlockInfo> block_info{}; @@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) return {BlockCollision::Inside, index}; } } - return {BlockCollision::None, -1}; + return {BlockCollision::None, 0xFFFFFFFF}; } struct ParseInfo { @@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) { const auto gather_end = labels.upper_bound(block.end); while (gather_start != gather_end) { cc.push(gather_start->second); - gather_start++; + ++gather_start; } }; if (state.queries.empty()) { return false; } + Query& q = state.queries.front(); const u32 block_index = state.registered[q.address]; BlockInfo& block = state.block_info[block_index]; - // If the block is visted, check if the stacks match, else gather the ssy/pbk + // If the block is visited, check if the stacks match, else gather the ssy/pbk // labels into the current stack and look if the branch at the end of the block // consumes a label. Schedule new queries accordingly if (block.visited) { BlockStack& stack = state.stacks[q.address]; - const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) && - (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack); + const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) && + (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack); state.queries.pop_front(); return all_okay; } block.visited = true; - state.stacks[q.address] = BlockStack{q}; + state.stacks.insert_or_assign(q.address, BlockStack{q}); + Query q2(q); state.queries.pop_front(); gather_labels(q2.ssy_stack, state.ssy_labels, block); @@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) { q2.address = block.end + 1; state.queries.push_back(q2); } + Query conditional_query{q2}; if (block.branch.is_sync) { if (block.branch.address == unassigned_branch) { @@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = block.branch.address; - state.queries.push_back(conditional_query); + state.queries.push_back(std::move(conditional_query)); return true; } +} // Anonymous namespace -std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size, - u32 start_address) { +std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, + std::size_t program_size, u32 start_address) { CFGRebuildState state{program_code, program_size, start_address}; + // Inspect Code and generate blocks state.labels.clear(); state.labels.emplace(start_address); @@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u return {}; } } + // Decompile Stacks - Query start_query{}; - start_query.address = state.start; - state.queries.push_back(start_query); + state.queries.push_back(Query{state.start, {}, {}}); bool decompiled = true; while (!state.queries.empty()) { if (!TryQuery(state)) { @@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u break; } } + // Sort and organize results std::sort(state.block_info.begin(), state.block_info.end(), - [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; }); + [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; }); ShaderCharacteristics result_out{}; result_out.decompilable = decompiled; result_out.start = start_address; result_out.end = start_address; - for (auto& block : state.block_info) { + for (const auto& block : state.block_info) { ShaderBlock new_block{}; new_block.start = block.start; new_block.end = block.end; @@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u } if (result_out.decompilable) { result_out.labels = std::move(state.labels); - return {result_out}; + return {std::move(result_out)}; } + // If it's not decompilable, merge the unlabelled blocks together auto back = result_out.blocks.begin(); auto next = std::next(back); @@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u continue; } back = next; - next++; + ++next; } - return {result_out}; + return {std::move(result_out)}; } } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 5e8ea3271..b0a5e4f8c 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -4,7 +4,6 @@ #pragma once -#include <cstring> #include <list> #include <optional> #include <unordered_set> @@ -26,27 +25,44 @@ struct Condition { bool IsUnconditional() const { return predicate == Pred::UnusedIndex && cc == ConditionCode::T; } + bool operator==(const Condition& other) const { return std::tie(predicate, cc) == std::tie(other.predicate, other.cc); } + + bool operator!=(const Condition& other) const { + return !operator==(other); + } }; struct ShaderBlock { - u32 start{}; - u32 end{}; - bool ignore_branch{}; struct Branch { Condition cond{}; bool kills{}; s32 address{}; + bool operator==(const Branch& b) const { return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address); } - } branch{}; + + bool operator!=(const Branch& b) const { + return !operator==(b); + } + }; + + u32 start{}; + u32 end{}; + bool ignore_branch{}; + Branch branch{}; + bool operator==(const ShaderBlock& sb) const { return std::tie(start, end, ignore_branch, branch) == std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch); } + + bool operator!=(const ShaderBlock& sb) const { + return !operator==(sb); + } }; struct ShaderCharacteristics { @@ -57,7 +73,7 @@ struct ShaderCharacteristics { std::unordered_set<u32> labels{}; }; -std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size, - u32 start_address); +std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, + std::size_t program_size, u32 start_address); } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index afffd157f..b547d8323 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -47,14 +47,14 @@ void ShaderIR::Decode() { if (shader_info.decompilable) { disable_flow_stack = true; const auto insert_block = [this](NodeBlock& nodes, u32 label) { - if (label == exit_branch) { + if (label == static_cast<u32>(exit_branch)) { return; } basic_blocks.insert({label, nodes}); }; const auto& blocks = shader_info.blocks; NodeBlock current_block; - u32 current_label = exit_branch; + u32 current_label = static_cast<u32>(exit_branch); for (auto& block : blocks) { if (shader_info.labels.count(block.start) != 0) { insert_block(current_block, current_label); diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 87d8fecaa..1473c282a 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case OpCode::Id::FMUL_R: case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. - UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented", - instr.fmul.tab5cb8_2.Value()); - UNIMPLEMENTED_IF_MSG( - instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented", - instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default + if (instr.fmul.tab5cb8_2 != 0) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); + } + if (instr.fmul.tab5c68_0 != 1) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0.Value()); + } op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b); diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index 7bcf38f23..6466fc011 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); } } else { - UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); + if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp index 29be25ca3..ca2f39e8d 100644 --- a/src/video_core/shader/decode/ffma.cpp +++ b/src/video_core/shader/decode/ffma.cpp @@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented"); - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented", - instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented", - instr.ffma.tab5980_1.Value()); + if (instr.ffma.tab5980_0 != 1) { + LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value()); + } + if (instr.ffma.tab5980_1 != 0) { + LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value()); + } const Node op_a = GetRegister(instr.gpr8); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index ad180d6df..a6c082cc9 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -18,7 +18,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); + DEBUG_ASSERT(instr.hsetp2.ftz == 0); Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); @@ -52,15 +52,15 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { } const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op); - const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred); + const Node combined_pred = GetPredicate(instr.hsetp2.pred3, instr.hsetp2.neg_pred); const auto Write = [&](u64 dest, Node src) { - SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39)); + SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred)); }; const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b); const u64 first = instr.hsetp2.pred0; - const u64 second = instr.hsetp2.pred3; + const u64 second = instr.hsetp2.pred39; if (h_and) { const Node joined = Operation(OperationCode::LogicalAnd2, comparison); Write(first, joined); diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index c3bcf1ae9..5b44cb79c 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) { - UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None); + DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None); } else { - UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None); + DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None); } constexpr auto identity = HalfType::H0_H1; diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index c0f64d7a0..ac0e764d6 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -22,6 +22,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); switch (opcode->get().GetId()) { + case OpCode::Id::NOP: { + UNIMPLEMENTED_IF(instr.nop.cc != Tegra::Shader::ConditionCode::T); + UNIMPLEMENTED_IF(instr.nop.trigger != 0); + // With the previous preconditions, this instruction is a no-operation. + break; + } case OpCode::Id::EXIT: { const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}", diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index a53e02253..55f5949e4 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co return TrackCbuf(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) { - if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) { + for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { + if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) { // Cbuf found in operand. return found; } diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 6af9044ca..683c49207 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -24,9 +24,8 @@ StagingCache::StagingCache() = default; StagingCache::~StagingCache() = default; SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params) - : params{params}, mipmap_sizes(params.num_levels), - mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{ - params.GetHostSizeInBytes()} { + : params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr}, + mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) { std::size_t offset = 0; for (u32 level = 0; level < params.num_levels; ++level) { const std::size_t mipmap_size{params.GetGuestMipmapSize(level)}; |