summaryrefslogtreecommitdiffstats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r--src/video_core/engines/maxwell_3d.cpp49
-rw-r--r--src/video_core/engines/maxwell_3d.h45
-rw-r--r--src/video_core/engines/maxwell_dma.cpp2
-rw-r--r--src/video_core/engines/shader_bytecode.h7
-rw-r--r--src/video_core/gpu.cpp68
-rw-r--r--src/video_core/gpu.h43
-rw-r--r--src/video_core/gpu_asynch.cpp9
-rw-r--r--src/video_core/gpu_asynch.h3
-rw-r--r--src/video_core/gpu_synch.cpp2
-rw-r--r--src/video_core/gpu_synch.h4
-rw-r--r--src/video_core/gpu_thread.cpp27
-rw-r--r--src/video_core/gpu_thread.h32
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp13
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp1
-rw-r--r--src/video_core/shader/control_flow.cpp47
-rw-r--r--src/video_core/shader/control_flow.h30
-rw-r--r--src/video_core/shader/decode.cpp4
-rw-r--r--src/video_core/shader/decode/arithmetic.cpp13
-rw-r--r--src/video_core/shader/decode/arithmetic_half_immediate.cpp4
-rw-r--r--src/video_core/shader/decode/ffma.cpp10
-rw-r--r--src/video_core/shader/decode/half_set_predicate.cpp8
-rw-r--r--src/video_core/shader/decode/hfma2.cpp4
-rw-r--r--src/video_core/shader/decode/other.cpp6
-rw-r--r--src/video_core/shader/track.cpp4
-rw-r--r--src/video_core/texture_cache/surface_base.cpp5
25 files changed, 312 insertions, 128 deletions
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index fe9fc0278..125c53360 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -385,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessQueryGet();
break;
}
+ case MAXWELL3D_REG_INDEX(condition.mode): {
+ ProcessQueryCondition();
+ break;
+ }
case MAXWELL3D_REG_INDEX(sync_info): {
ProcessSyncPoint();
break;
@@ -438,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() {
result = regs.query.query_sequence;
break;
default:
+ result = 1;
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
}
@@ -477,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() {
}
}
+void Maxwell3D::ProcessQueryCondition() {
+ const GPUVAddr condition_address{regs.condition.Address()};
+ switch (regs.condition.mode) {
+ case Regs::ConditionMode::Always: {
+ execute_on = true;
+ break;
+ }
+ case Regs::ConditionMode::Never: {
+ execute_on = false;
+ break;
+ }
+ case Regs::ConditionMode::ResNonZero: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
+ break;
+ }
+ case Regs::ConditionMode::Equal: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on =
+ cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
+ break;
+ }
+ case Regs::ConditionMode::NotEqual: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on =
+ cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
+ break;
+ }
+ default: {
+ UNIMPLEMENTED_MSG("Uninplemented Condition Mode!");
+ execute_on = true;
+ break;
+ }
+ }
+}
+
void Maxwell3D::ProcessSyncPoint() {
const u32 sync_point = regs.sync_info.sync_point.Value();
const u32 increment = regs.sync_info.increment.Value();
const u32 cache_flush = regs.sync_info.unknown.Value();
- LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
- cache_flush);
+ if (increment) {
+ system.GPU().IncrementSyncPoint(sync_point);
+ }
}
void Maxwell3D::DrawArrays() {
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index ac300bf76..1ee982b76 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -90,6 +90,20 @@ public:
enum class QuerySelect : u32 {
Zero = 0,
+ TimeElapsed = 2,
+ TransformFeedbackPrimitivesGenerated = 11,
+ PrimitivesGenerated = 18,
+ SamplesPassed = 21,
+ TransformFeedbackUnknown = 26,
+ };
+
+ struct QueryCompare {
+ u32 initial_sequence;
+ u32 initial_mode;
+ u32 unknown1;
+ u32 unknown2;
+ u32 current_sequence;
+ u32 current_mode;
};
enum class QuerySyncCondition : u32 {
@@ -97,6 +111,14 @@ public:
GreaterThan = 1,
};
+ enum class ConditionMode : u32 {
+ Never = 0,
+ Always = 1,
+ ResNonZero = 2,
+ Equal = 3,
+ NotEqual = 4,
+ };
+
enum class ShaderProgram : u32 {
VertexA = 0,
VertexB = 1,
@@ -815,7 +837,18 @@ public:
BitField<4, 1, u32> alpha_to_one;
} multisample_control;
- INSERT_PADDING_WORDS(0x7);
+ INSERT_PADDING_WORDS(0x4);
+
+ struct {
+ u32 address_high;
+ u32 address_low;
+ ConditionMode mode;
+
+ GPUVAddr Address() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } condition;
struct {
u32 tsc_address_high;
@@ -1223,6 +1256,10 @@ public:
return macro_memory;
}
+ bool ShouldExecute() const {
+ return execute_on;
+ }
+
private:
void InitializeRegisterDefaults();
@@ -1257,6 +1294,8 @@ private:
Upload::State upload_state;
+ bool execute_on{true};
+
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
@@ -1284,6 +1323,9 @@ private:
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
+ // Handles Conditional Rendering
+ void ProcessQueryCondition();
+
/// Handles writes to syncing register.
void ProcessSyncPoint();
@@ -1357,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
ASSERT_REG_POSITION(point_size, 0x546);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
+ASSERT_REG_POSITION(condition, 0x554);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
ASSERT_REG_POSITION(tic, 0x55D);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index b5f57e534..a28c04473 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
}
void MaxwellDMA::HandleCopy() {
- LOG_WARNING(HW_GPU, "Requested a DMA copy");
+ LOG_TRACE(HW_GPU, "Requested a DMA copy");
const GPUVAddr source = regs.src_address.Address();
const GPUVAddr dest = regs.dst_address.Address();
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 8520a0143..083ee3304 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -560,6 +560,11 @@ union Instruction {
BitField<48, 16, u64> opcode;
union {
+ BitField<8, 5, ConditionCode> cc;
+ BitField<13, 1, u64> trigger;
+ } nop;
+
+ union {
BitField<8, 8, Register> gpr;
BitField<20, 24, s64> offset;
} gmem;
@@ -1516,6 +1521,7 @@ public:
TMML, // Texture Mip Map Level
SUST, // Surface Store
EXIT,
+ NOP,
IPA,
OUT_R, // Emit vertex/primitive
ISBERD,
@@ -1795,6 +1801,7 @@ private:
INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
INST("11101011001-----", Id::SUST, Type::Image, "SUST"),
+ INST("0101000010110---", Id::NOP, Type::Trivial, "NOP"),
INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index e25754e37..1622332a4 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
UNREACHABLE();
}
-GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
+ : system{system}, renderer{renderer}, is_async{is_async} {
auto& rasterizer{renderer.Rasterizer()};
memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
@@ -74,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher;
}
+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+ syncpoints[syncpoint_id]++;
+ std::lock_guard lock{sync_mutex};
+ if (!syncpt_interrupts[syncpoint_id].empty()) {
+ u32 value = syncpoints[syncpoint_id].load();
+ auto it = syncpt_interrupts[syncpoint_id].begin();
+ while (it != syncpt_interrupts[syncpoint_id].end()) {
+ if (value >= *it) {
+ TriggerCpuInterrupt(syncpoint_id, *it);
+ it = syncpt_interrupts[syncpoint_id].erase(it);
+ continue;
+ }
+ it++;
+ }
+ }
+}
+
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+ return syncpoints[syncpoint_id].load();
+}
+
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+ auto& interrupt = syncpt_interrupts[syncpoint_id];
+ bool contains = std::any_of(interrupt.begin(), interrupt.end(),
+ [value](u32 in_value) { return in_value == value; });
+ if (contains) {
+ return;
+ }
+ syncpt_interrupts[syncpoint_id].emplace_back(value);
+}
+
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+ std::lock_guard lock{sync_mutex};
+ auto& interrupt = syncpt_interrupts[syncpoint_id];
+ const auto iter =
+ std::find_if(interrupt.begin(), interrupt.end(),
+ [value](u32 interrupt_value) { return value == interrupt_value; });
+
+ if (iter == interrupt.end()) {
+ return false;
+ }
+ interrupt.erase(iter);
+ return true;
+}
+
u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
ASSERT(format != RenderTargetFormat::NONE);
@@ -151,12 +197,12 @@ enum class BufferMethods {
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
- Unk2c = 0xB,
+ UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
- Unk70 = 0x1C,
- Unk74 = 0x1D,
+ FenceValue = 0x1C,
+ FenceAction = 0x1D,
Unk78 = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
@@ -202,6 +248,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::SemaphoreAddressLow:
case BufferMethods::SemaphoreSequence:
case BufferMethods::RefCnt:
+ case BufferMethods::UnkCacheFlush:
+ case BufferMethods::WrcacheFlush:
+ case BufferMethods::FenceValue:
+ case BufferMethods::FenceAction:
break;
case BufferMethods::SemaphoreTrigger: {
ProcessSemaphoreTriggerMethod();
@@ -212,21 +262,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
break;
}
- case BufferMethods::WrcacheFlush: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented");
- break;
- }
case BufferMethods::Unk28: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
break;
}
- case BufferMethods::Unk2c: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented");
- break;
- }
case BufferMethods::SemaphoreAcquire: {
ProcessSemaphoreAcquire();
break;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 0ace0ff4f..87c96f46b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -5,8 +5,12 @@
#pragma once
#include <array>
+#include <atomic>
+#include <list>
#include <memory>
+#include <mutex>
#include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/dma_pusher.h"
@@ -127,7 +131,7 @@ class MemoryManager;
class GPU {
public:
- explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+ explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);
virtual ~GPU();
@@ -170,6 +174,22 @@ public:
/// Returns a reference to the GPU DMA pusher.
Tegra::DmaPusher& DmaPusher();
+ void IncrementSyncPoint(u32 syncpoint_id);
+
+ u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+ void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+ bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+ std::unique_lock<std::mutex> LockSync() {
+ return std::unique_lock{sync_mutex};
+ }
+
+ bool IsAsync() const {
+ return is_async;
+ }
+
/// Returns a const reference to the GPU DMA pusher.
const Tegra::DmaPusher& DmaPusher() const;
@@ -200,7 +220,12 @@ public:
u32 semaphore_acquire;
u32 semaphore_release;
- INSERT_PADDING_WORDS(0xE4);
+ u32 fence_value;
+ union {
+ BitField<4, 4, u32> operation;
+ BitField<8, 8, u32> id;
+ } fence_action;
+ INSERT_PADDING_WORDS(0xE2);
// Puller state
u32 acquire_mode;
@@ -234,6 +259,9 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
+protected:
+ virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+
private:
void ProcessBindMethod(const MethodCall& method_call);
void ProcessSemaphoreTriggerMethod();
@@ -252,6 +280,7 @@ private:
protected:
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
VideoCore::RendererBase& renderer;
+ Core::System& system;
private:
std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -268,6 +297,14 @@ private:
std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
/// Inline memory engine
std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+ std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+ std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+ std::mutex sync_mutex;
+
+ const bool is_async;
};
#define ASSERT_REG_POSITION(field_name, position) \
@@ -280,6 +317,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
ASSERT_REG_POSITION(reference_count, 0x14);
ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ASSERT_REG_POSITION(fence_value, 0x1C);
+ASSERT_REG_POSITION(fence_action, 0x1D);
ASSERT_REG_POSITION(acquire_mode, 0x100);
ASSERT_REG_POSITION(acquire_source, 0x101);
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index d4e2553a9..ea67be831 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -2,6 +2,8 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
#include "video_core/gpu_asynch.h"
#include "video_core/gpu_thread.h"
#include "video_core/renderer_base.h"
@@ -9,7 +11,7 @@
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
- : GPU(system, renderer), gpu_thread{system} {}
+ : GPU(system, renderer, true), gpu_thread{system} {}
GPUAsynch::~GPUAsynch() = default;
@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+ auto& interrupt_manager = system.InterruptManager();
+ interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+}
+
} // namespace VideoCommon
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 30be74cba..36377d677 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -27,6 +27,9 @@ public:
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+protected:
+ void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
private:
GPUThread::ThreadManager gpu_thread;
};
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 45e43b1dc..d4ead9c47 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -8,7 +8,7 @@
namespace VideoCommon {
GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
- : GPU(system, renderer) {}
+ : GPU(system, renderer, false) {}
GPUSynch::~GPUSynch() = default;
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 3031fcf72..07bcc47f1 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -25,6 +25,10 @@ public:
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+
+protected:
+ void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+ [[maybe_unused]] u32 value) const override {}
};
} // namespace VideoCommon
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 3f0939ec9..b441e92b0 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
MicroProfileOnThreadCreate("GpuThread");
// Wait for first GPU command before acquiring the window context
- state.WaitForCommands();
+ while (state.queue.Empty())
+ ;
// If emulation was stopped during disk shader loading, abort before trying to acquire context
if (!state.is_running) {
@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
CommandDataContainer next;
while (state.is_running) {
- state.WaitForCommands();
while (!state.queue.Empty()) {
state.queue.Pop(next);
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
} else {
UNREACHABLE();
}
- state.signaled_fence = next.fence;
- state.TrySynchronize();
+ state.signaled_fence.store(next.fence);
}
}
}
@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
}
void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
- if (state.queue.Empty()) {
- // It's quicker to invalidate a single region on the CPU if the queue is already empty
- system.Renderer().Rasterizer().InvalidateRegion(addr, size);
- } else {
- PushCommand(InvalidateRegionCommand(addr, size));
- }
+ system.Renderer().Rasterizer().InvalidateRegion(addr, size);
}
void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
u64 ThreadManager::PushCommand(CommandData&& command_data) {
const u64 fence{++state.last_fence};
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
- state.SignalCommands();
return fence;
}
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
void SynchState::WaitForSynchronization(u64 fence) {
- if (signaled_fence >= fence) {
- return;
- }
-
- // Wait for the GPU to be idle (all commands to be executed)
- {
- MICROPROFILE_SCOPE(GPU_wait);
- std::unique_lock lock{synchronization_mutex};
- synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
- }
+ while (signaled_fence.load() < fence)
+ ;
}
} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 05a168a72..1d9d0c39e 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -88,41 +88,9 @@ struct CommandDataContainer {
/// Struct used to synchronize the GPU thread
struct SynchState final {
std::atomic_bool is_running{true};
- std::atomic_int queued_frame_count{};
- std::mutex synchronization_mutex;
- std::mutex commands_mutex;
- std::condition_variable commands_condition;
- std::condition_variable synchronization_condition;
-
- /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
- /// synchronized. This is entirely empirical.
- bool IsSynchronized() const {
- constexpr std::size_t max_queue_gap{5};
- return queue.Size() <= max_queue_gap;
- }
-
- void TrySynchronize() {
- if (IsSynchronized()) {
- std::lock_guard lock{synchronization_mutex};
- synchronization_condition.notify_one();
- }
- }
void WaitForSynchronization(u64 fence);
- void SignalCommands() {
- if (queue.Empty()) {
- return;
- }
-
- commands_condition.notify_one();
- }
-
- void WaitForCommands() {
- std::unique_lock lock{commands_mutex};
- commands_condition.wait(lock, [this] { return !queue.Empty(); });
- }
-
using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
CommandQueue queue;
u64 last_fence{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c59e687b6..c28ae795c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -595,7 +595,13 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, boo
}
void RasterizerOpenGL::Clear() {
- const auto& regs = system.GPU().Maxwell3D().regs;
+ const auto& maxwell3d = system.GPU().Maxwell3D();
+
+ if (!maxwell3d.ShouldExecute()) {
+ return;
+ }
+
+ const auto& regs = maxwell3d.regs;
bool use_color{};
bool use_depth{};
bool use_stencil{};
@@ -697,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() {
MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();
+
+ if (!gpu.ShouldExecute()) {
+ return;
+ }
+
const auto& regs = gpu.regs;
SyncColorMask();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 8fcd39a69..408332f90 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]};
- ASSERT(component_type == format.component_type);
return format;
}
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index fdcc970ff..ec3a76690 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -15,7 +15,7 @@
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {
-
+namespace {
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
@@ -29,8 +29,7 @@ struct Query {
struct BlockStack {
BlockStack() = default;
- BlockStack(const BlockStack& b) = default;
- BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+ explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
std::stack<u32> ssy_stack{};
std::stack<u32> pbk_stack{};
};
@@ -58,7 +57,7 @@ struct BlockInfo {
struct CFGRebuildState {
explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
const u32 start)
- : program_code{program_code}, program_size{program_size}, start{start} {}
+ : start{start}, program_code{program_code}, program_size{program_size} {}
u32 start{};
std::vector<BlockInfo> block_info{};
@@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address)
return {BlockCollision::Inside, index};
}
}
- return {BlockCollision::None, -1};
+ return {BlockCollision::None, 0xFFFFFFFF};
}
struct ParseInfo {
@@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) {
const auto gather_end = labels.upper_bound(block.end);
while (gather_start != gather_end) {
cc.push(gather_start->second);
- gather_start++;
+ ++gather_start;
}
};
if (state.queries.empty()) {
return false;
}
+
Query& q = state.queries.front();
const u32 block_index = state.registered[q.address];
BlockInfo& block = state.block_info[block_index];
- // If the block is visted, check if the stacks match, else gather the ssy/pbk
+ // If the block is visited, check if the stacks match, else gather the ssy/pbk
// labels into the current stack and look if the branch at the end of the block
// consumes a label. Schedule new queries accordingly
if (block.visited) {
BlockStack& stack = state.stacks[q.address];
- const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) &&
- (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack);
+ const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
+ (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
state.queries.pop_front();
return all_okay;
}
block.visited = true;
- state.stacks[q.address] = BlockStack{q};
+ state.stacks.insert_or_assign(q.address, BlockStack{q});
+
Query q2(q);
state.queries.pop_front();
gather_labels(q2.ssy_stack, state.ssy_labels, block);
@@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) {
q2.address = block.end + 1;
state.queries.push_back(q2);
}
+
Query conditional_query{q2};
if (block.branch.is_sync) {
if (block.branch.address == unassigned_branch) {
@@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) {
conditional_query.pbk_stack.pop();
}
conditional_query.address = block.branch.address;
- state.queries.push_back(conditional_query);
+ state.queries.push_back(std::move(conditional_query));
return true;
}
+} // Anonymous namespace
-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
- u32 start_address) {
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+ std::size_t program_size, u32 start_address) {
CFGRebuildState state{program_code, program_size, start_address};
+
// Inspect Code and generate blocks
state.labels.clear();
state.labels.emplace(start_address);
@@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
return {};
}
}
+
// Decompile Stacks
- Query start_query{};
- start_query.address = state.start;
- state.queries.push_back(start_query);
+ state.queries.push_back(Query{state.start, {}, {}});
bool decompiled = true;
while (!state.queries.empty()) {
if (!TryQuery(state)) {
@@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
break;
}
}
+
// Sort and organize results
std::sort(state.block_info.begin(), state.block_info.end(),
- [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
+ [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; });
ShaderCharacteristics result_out{};
result_out.decompilable = decompiled;
result_out.start = start_address;
result_out.end = start_address;
- for (auto& block : state.block_info) {
+ for (const auto& block : state.block_info) {
ShaderBlock new_block{};
new_block.start = block.start;
new_block.end = block.end;
@@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
}
if (result_out.decompilable) {
result_out.labels = std::move(state.labels);
- return {result_out};
+ return {std::move(result_out)};
}
+
// If it's not decompilable, merge the unlabelled blocks together
auto back = result_out.blocks.begin();
auto next = std::next(back);
@@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
continue;
}
back = next;
- next++;
+ ++next;
}
- return {result_out};
+ return {std::move(result_out)};
}
} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
index 5e8ea3271..b0a5e4f8c 100644
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -4,7 +4,6 @@
#pragma once
-#include <cstring>
#include <list>
#include <optional>
#include <unordered_set>
@@ -26,27 +25,44 @@ struct Condition {
bool IsUnconditional() const {
return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
}
+
bool operator==(const Condition& other) const {
return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
}
+
+ bool operator!=(const Condition& other) const {
+ return !operator==(other);
+ }
};
struct ShaderBlock {
- u32 start{};
- u32 end{};
- bool ignore_branch{};
struct Branch {
Condition cond{};
bool kills{};
s32 address{};
+
bool operator==(const Branch& b) const {
return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
}
- } branch{};
+
+ bool operator!=(const Branch& b) const {
+ return !operator==(b);
+ }
+ };
+
+ u32 start{};
+ u32 end{};
+ bool ignore_branch{};
+ Branch branch{};
+
bool operator==(const ShaderBlock& sb) const {
return std::tie(start, end, ignore_branch, branch) ==
std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
}
+
+ bool operator!=(const ShaderBlock& sb) const {
+ return !operator==(sb);
+ }
};
struct ShaderCharacteristics {
@@ -57,7 +73,7 @@ struct ShaderCharacteristics {
std::unordered_set<u32> labels{};
};
-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
- u32 start_address);
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+ std::size_t program_size, u32 start_address);
} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index afffd157f..b547d8323 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -47,14 +47,14 @@ void ShaderIR::Decode() {
if (shader_info.decompilable) {
disable_flow_stack = true;
const auto insert_block = [this](NodeBlock& nodes, u32 label) {
- if (label == exit_branch) {
+ if (label == static_cast<u32>(exit_branch)) {
return;
}
basic_blocks.insert({label, nodes});
};
const auto& blocks = shader_info.blocks;
NodeBlock current_block;
- u32 current_label = exit_branch;
+ u32 current_label = static_cast<u32>(exit_branch);
for (auto& block : blocks) {
if (shader_info.labels.count(block.start) != 0) {
insert_block(current_block, current_label);
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 87d8fecaa..1473c282a 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
case OpCode::Id::FMUL_R:
case OpCode::Id::FMUL_IMM: {
// FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
- UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented",
- instr.fmul.tab5cb8_2.Value());
- UNIMPLEMENTED_IF_MSG(
- instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented",
- instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default
+ if (instr.fmul.tab5cb8_2 != 0) {
+ LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
+ instr.fmul.tab5cb8_2.Value());
+ }
+ if (instr.fmul.tab5c68_0 != 1) {
+ LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
+ instr.fmul.tab5c68_0.Value());
+ }
op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index 7bcf38f23..6466fc011 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
}
} else {
- UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
+ if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
+ LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+ }
}
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp
index 29be25ca3..ca2f39e8d 100644
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
const auto opcode = OpCode::Decode(instr);
UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
- UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented",
- instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
- UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented",
- instr.ffma.tab5980_1.Value());
+ if (instr.ffma.tab5980_0 != 1) {
+ LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
+ }
+ if (instr.ffma.tab5980_1 != 0) {
+ LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
+ }
const Node op_a = GetRegister(instr.gpr8);
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index ad180d6df..a6c082cc9 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -18,7 +18,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
- UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
+ DEBUG_ASSERT(instr.hsetp2.ftz == 0);
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
@@ -52,15 +52,15 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
}
const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
- const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);
+ const Node combined_pred = GetPredicate(instr.hsetp2.pred3, instr.hsetp2.neg_pred);
const auto Write = [&](u64 dest, Node src) {
- SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39));
+ SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred));
};
const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
const u64 first = instr.hsetp2.pred0;
- const u64 second = instr.hsetp2.pred3;
+ const u64 second = instr.hsetp2.pred39;
if (h_and) {
const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
Write(first, joined);
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index c3bcf1ae9..5b44cb79c 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
const auto opcode = OpCode::Decode(instr);
if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) {
- UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None);
+ DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None);
} else {
- UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None);
+ DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None);
}
constexpr auto identity = HalfType::H0_H1;
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index c0f64d7a0..ac0e764d6 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -22,6 +22,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
const auto opcode = OpCode::Decode(instr);
switch (opcode->get().GetId()) {
+ case OpCode::Id::NOP: {
+ UNIMPLEMENTED_IF(instr.nop.cc != Tegra::Shader::ConditionCode::T);
+ UNIMPLEMENTED_IF(instr.nop.trigger != 0);
+ // With the previous preconditions, this instruction is a no-operation.
+ break;
+ }
case OpCode::Id::EXIT: {
const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}",
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index a53e02253..55f5949e4 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
return TrackCbuf(source, code, new_cursor);
}
if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
- for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
- if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
+ for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
+ if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) {
// Cbuf found in operand.
return found;
}
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 6af9044ca..683c49207 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -24,9 +24,8 @@ StagingCache::StagingCache() = default;
StagingCache::~StagingCache() = default;
SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
- : params{params}, mipmap_sizes(params.num_levels),
- mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{
- params.GetHostSizeInBytes()} {
+ : params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
+ mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) {
std::size_t offset = 0;
for (u32 level = 0; level < params.num_levels; ++level) {
const std::size_t mipmap_size{params.GetGuestMipmapSize(level)};