Diffstat (limited to 'src/video_core/engines')

-rw-r--r--  src/video_core/engines/const_buffer_info.h |  17
-rw-r--r--  src/video_core/engines/engine_upload.cpp   |   6
-rw-r--r--  src/video_core/engines/engine_upload.h     |   6
-rw-r--r--  src/video_core/engines/fermi_2d.cpp        |  32
-rw-r--r--  src/video_core/engines/fermi_2d.h          |  56
-rw-r--r--  src/video_core/engines/kepler_compute.cpp  |  62
-rw-r--r--  src/video_core/engines/kepler_compute.h    |  23
-rw-r--r--  src/video_core/engines/kepler_memory.cpp   |   4
-rw-r--r--  src/video_core/engines/kepler_memory.h     |   1
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp      | 464
-rw-r--r--  src/video_core/engines/maxwell_3d.h        | 186
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp     |  54
-rw-r--r--  src/video_core/engines/maxwell_dma.h       |  13
-rw-r--r--  src/video_core/engines/shader_bytecode.h   | 233
14 files changed, 958 insertions(+), 199 deletions(-)
diff --git a/src/video_core/engines/const_buffer_info.h b/src/video_core/engines/const_buffer_info.h
new file mode 100644
index 000000000..d8f672462
--- /dev/null
+++ b/src/video_core/engines/const_buffer_info.h
@@ -0,0 +1,17 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+struct ConstBufferInfo {
+ GPUVAddr address;
+ u32 size;
+ bool enabled;
+};
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index 082a40cd9..d44ad0cd8 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -36,10 +36,10 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
} else {
UNIMPLEMENTED_IF(regs.dest.z != 0);
UNIMPLEMENTED_IF(regs.dest.depth != 1);
- UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
- UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
+ UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
+ UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
const std::size_t dst_size = Tegra::Texture::CalculateSize(
- true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
+ true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
tmp_buffer.resize(dst_size);
memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
index ef4f5839a..462da419e 100644
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -39,15 +39,15 @@ struct Registers {
}
u32 BlockWidth() const {
- return 1U << block_width.Value();
+ return block_width.Value();
}
u32 BlockHeight() const {
- return 1U << block_height.Value();
+ return block_height.Value();
}
u32 BlockDepth() const {
- return 1U << block_depth.Value();
+ return block_depth.Value();
}
} dest;
};
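Note: BlockWidth/BlockHeight/BlockDepth (here, and in fermi_2d.h and maxwell_dma.h below) now return the raw register field, i.e. the log2 of the block dimension in GOBs, and Tegra::Texture::CalculateSize is handed that exponent directly, as the engine_upload.cpp hunk above shows. A minimal sketch of the relationship, assuming a caller that still wants the expanded value (variable names are illustrative, not from the patch):

    const u32 block_height_log2 = regs.dest.BlockHeight(); // raw field after this change
    const u32 block_height_gobs = 1U << block_height_log2; // what the accessor used to return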
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 55966eef1..7ff44f06d 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -4,15 +4,13 @@
#include "common/assert.h"
#include "common/logging/log.h"
-#include "common/math_util.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra::Engines {
-Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
- : rasterizer{rasterizer}, memory_manager{memory_manager} {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -31,25 +29,35 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
}
void Fermi2D::HandleSurfaceCopy() {
- LOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
- static_cast<u32>(regs.operation));
+ LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
+ static_cast<u32>(regs.operation));
// TODO(Subv): Only raw copies are implemented.
- ASSERT(regs.operation == Regs::Operation::SrcCopy);
+ ASSERT(regs.operation == Operation::SrcCopy);
const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)};
const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)};
- const u32 src_blit_x2{
- static_cast<u32>((regs.blit_src_x + (regs.blit_dst_width * regs.blit_du_dx)) >> 32)};
- const u32 src_blit_y2{
- static_cast<u32>((regs.blit_src_y + (regs.blit_dst_height * regs.blit_dv_dy)) >> 32)};
-
+ u32 src_blit_x2, src_blit_y2;
+ if (regs.blit_control.origin == Origin::Corner) {
+ src_blit_x2 =
+ static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32);
+ src_blit_y2 =
+ static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32);
+ } else {
+ src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
+ src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
+ }
const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
regs.blit_dst_x + regs.blit_dst_width,
regs.blit_dst_y + regs.blit_dst_height};
+ Config copy_config;
+ copy_config.operation = regs.operation;
+ copy_config.filter = regs.blit_control.filter;
+ copy_config.src_rect = src_rect;
+ copy_config.dst_rect = dst_rect;
- if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, src_rect, dst_rect)) {
+ if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) {
UNIMPLEMENTED();
}
}
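Note: the source rectangle is now computed differently depending on blit_control.origin. Restated from the branch above, with the 32.32 fixed-point reading of blit_src_x/blit_src_y and blit_du_dx/blit_dv_dy being my interpretation rather than something the patch states:

    // Origin::Corner - scale the extent through the delta registers:
    //   src_x2 = (blit_src_x + blit_dst_width  * blit_du_dx) >> 32
    //   src_y2 = (blit_src_y + blit_dst_height * blit_dv_dy) >> 32
    // Origin::Center - offset only the integer part:
    //   src_x2 = (blit_src_x >> 32) + blit_dst_width
    //   src_y2 = (blit_src_y >> 32) + blit_dst_height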
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 45f59a4d9..0901cf2fa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -9,6 +9,7 @@
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
+#include "common/math_util.h"
#include "video_core/gpu.h"
namespace Tegra {
@@ -32,12 +33,32 @@ namespace Tegra::Engines {
class Fermi2D final {
public:
- explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+ explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
~Fermi2D() = default;
/// Write the value to the register identified by method.
void CallMethod(const GPU::MethodCall& method_call);
+ enum class Origin : u32 {
+ Center = 0,
+ Corner = 1,
+ };
+
+ enum class Filter : u32 {
+ PointSample = 0, // Nearest
+ Linear = 1,
+ };
+
+ enum class Operation : u32 {
+ SrcCopyAnd = 0,
+ ROPAnd = 1,
+ Blend = 2,
+ SrcCopy = 3,
+ ROP = 4,
+ SrcCopyPremult = 5,
+ BlendPremult = 6,
+ };
+
struct Regs {
static constexpr std::size_t NUM_REGS = 0x258;
@@ -63,32 +84,19 @@ public:
}
u32 BlockWidth() const {
- // The block width is stored in log2 format.
- return 1 << block_width;
+ return block_width.Value();
}
u32 BlockHeight() const {
- // The block height is stored in log2 format.
- return 1 << block_height;
+ return block_height.Value();
}
u32 BlockDepth() const {
- // The block depth is stored in log2 format.
- return 1 << block_depth;
+ return block_depth.Value();
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
- enum class Operation : u32 {
- SrcCopyAnd = 0,
- ROPAnd = 1,
- Blend = 2,
- SrcCopy = 3,
- ROP = 4,
- SrcCopyPremult = 5,
- BlendPremult = 6,
- };
-
union {
struct {
INSERT_PADDING_WORDS(0x80);
@@ -105,7 +113,11 @@ public:
INSERT_PADDING_WORDS(0x177);
- u32 blit_control;
+ union {
+ u32 raw;
+ BitField<0, 1, Origin> origin;
+ BitField<4, 1, Filter> filter;
+ } blit_control;
INSERT_PADDING_WORDS(0x8);
@@ -124,9 +136,15 @@ public:
};
} regs{};
+ struct Config {
+ Operation operation;
+ Filter filter;
+ Common::Rectangle<u32> src_rect;
+ Common::Rectangle<u32> dst_rect;
+ };
+
private:
VideoCore::RasterizerInterface& rasterizer;
- MemoryManager& memory_manager;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
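Note: blit_control is now decoded through bitfields instead of being kept as a bare u32, and HandleSurfaceCopy forwards operation, filter and both rectangles to the rasterizer in a single Config. A small sketch of the decode, assuming a raw value of 0x11 purely for illustration:

    Fermi2D::Regs regs{};
    regs.blit_control.raw = 0x11;                          // hypothetical register write
    const auto origin = regs.blit_control.origin.Value();  // bit 0 set -> Origin::Corner
    const auto filter = regs.blit_control.filter.Value();  // bit 4 set -> Filter::Linear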
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 7404a8163..63d449135 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <bitset>
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
@@ -37,7 +38,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
- system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+ system.GPU().Maxwell3D().dirty.OnMemoryWrite();
}
break;
}
@@ -49,14 +50,67 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
}
}
-void KeplerCompute::ProcessLaunch() {
+Tegra::Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
+ const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value();
+ ASSERT(cbuf_mask[regs.tex_cb_index]);
+
+ const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index];
+ ASSERT(texinfo.Address() != 0);
+
+ const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle);
+ ASSERT(address < texinfo.Address() + texinfo.size);
+
+ const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)};
+ return GetTextureInfo(tex_handle, offset);
+}
+Texture::FullTextureInfo KeplerCompute::GetTextureInfo(const Texture::TextureHandle tex_handle,
+ std::size_t offset) const {
+ return Texture::FullTextureInfo{static_cast<u32>(offset), GetTICEntry(tex_handle.tic_id),
+ GetTSCEntry(tex_handle.tsc_id)};
+}
+
+u32 KeplerCompute::AccessConstBuffer32(u64 const_buffer, u64 offset) const {
+ const auto& buffer = launch_description.const_buffer_config[const_buffer];
+ u32 result;
+ std::memcpy(&result, memory_manager.GetPointer(buffer.Address() + offset), sizeof(u32));
+ return result;
+}
+
+void KeplerCompute::ProcessLaunch() {
const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
- const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
- LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
+ const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
+ LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);
+
+ rasterizer.DispatchCompute(code_addr);
+}
+
+Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
+ const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};
+
+ Texture::TICEntry tic_entry;
+ memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
+
+ const auto r_type{tic_entry.r_type.Value()};
+ const auto g_type{tic_entry.g_type.Value()};
+ const auto b_type{tic_entry.b_type.Value()};
+ const auto a_type{tic_entry.a_type.Value()};
+
+ // TODO(Subv): Different data types for separate components are not supported
+ DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
+
+ return tic_entry;
+}
+
+Texture::TSCEntry KeplerCompute::GetTSCEntry(u32 tsc_index) const {
+ const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};
+
+ Texture::TSCEntry tsc_entry;
+ memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
+ return tsc_entry;
}
} // namespace Tegra::Engines
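Note: GetTexture resolves compute textures through the launch descriptor rather than graphics state: the const buffer selected by regs.tex_cb_index holds packed Texture::TextureHandle words, and the handle's tic_id/tsc_id index the TIC and TSC tables read by the two new helpers. A rough caller-side sketch (kepler_compute is an illustrative reference to this engine, and the FullTextureInfo field names follow video_core/textures/texture.h as I understand it, not this hunk):

    // offset = position of the handle inside the texture const buffer
    const auto tex = kepler_compute.GetTexture(/*offset=*/0);
    // tex.tic describes the image, tex.tsc the sampler, mirroring the Maxwell3D path.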
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 6a3309a2c..90cf650d2 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -12,6 +12,7 @@
#include "common/common_types.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/gpu.h"
+#include "video_core/textures/texture.h"
namespace Core {
class System;
@@ -111,7 +112,7 @@ public:
INSERT_PADDING_WORDS(0x3FE);
- u32 texture_const_buffer_index;
+ u32 tex_cb_index;
INSERT_PADDING_WORDS(0x374);
};
@@ -149,7 +150,7 @@ public:
union {
BitField<0, 8, u32> const_buffer_enable_mask;
BitField<29, 2, u32> cache_layout;
- } memory_config;
+ };
INSERT_PADDING_WORDS(0x8);
@@ -194,6 +195,14 @@ public:
/// Write the value to the register identified by method.
void CallMethod(const GPU::MethodCall& method_call);
+ Tegra::Texture::FullTextureInfo GetTexture(std::size_t offset) const;
+
+ /// Given a Texture Handle, returns the TSC and TIC entries.
+ Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle,
+ std::size_t offset) const;
+
+ u32 AccessConstBuffer32(u64 const_buffer, u64 offset) const;
+
private:
Core::System& system;
VideoCore::RasterizerInterface& rasterizer;
@@ -201,6 +210,12 @@ private:
Upload::State upload_state;
void ProcessLaunch();
+
+ /// Retrieves information about a specific TIC entry from the TIC buffer.
+ Texture::TICEntry GetTICEntry(u32 tic_index) const;
+
+ /// Retrieves information about a specific TSC entry from the TSC buffer.
+ Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
};
#define ASSERT_REG_POSITION(field_name, position) \
@@ -218,12 +233,12 @@ ASSERT_REG_POSITION(launch, 0xAF);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(tic, 0x55D);
ASSERT_REG_POSITION(code_loc, 0x582);
-ASSERT_REG_POSITION(texture_const_buffer_index, 0x982);
+ASSERT_REG_POSITION(tex_cb_index, 0x982);
ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
-ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_enable_mask, 0x14);
ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
#undef ASSERT_REG_POSITION
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 0561f676c..fa4a7c5c1 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
- : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
+ : system{system}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::~KeplerMemory() = default;
@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
- system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+ system.GPU().Maxwell3D().dirty.OnMemoryWrite();
}
break;
}
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index f3bc675a9..e0e25c321 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:
private:
Core::System& system;
- MemoryManager& memory_manager;
Upload::State upload_state;
};
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 39968d403..b318aedb8 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
MemoryManager& memory_manager)
: system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+ InitDirtySettings();
InitializeRegisterDefaults();
}
@@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
regs.stencil_back_func_mask = 0xFFFFFFFF;
regs.stencil_back_mask = 0xFFFFFFFF;
+ regs.depth_test_func = Regs::ComparisonOp::Always;
+ regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
+ regs.cull.cull_face = Regs::Cull::CullFace::Back;
+
// TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
// register carrying a default value. Assume it's OpenGL's default (1).
regs.point_size = 1.0f;
@@ -84,23 +89,180 @@ void Maxwell3D::InitializeRegisterDefaults() {
// Commercial games seem to assume this value is enabled and nouveau sets this value manually.
regs.rt_separate_frag_data = 1;
+
+ // Some games (like Super Mario Odyssey) assume that SRGB is enabled.
+ regs.framebuffer_srgb = 1;
+ mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true;
+ mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true;
+ mme_inline[MAXWELL3D_REG_INDEX(vertex_buffer.count)] = true;
+ mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}
-void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
+#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
+
+void Maxwell3D::InitDirtySettings() {
+ const auto set_block = [this](const u32 start, const u32 range, const u8 position) {
+ const auto start_itr = dirty_pointers.begin() + start;
+ const auto end_itr = start_itr + range;
+ std::fill(start_itr, end_itr, position);
+ };
+ dirty.regs.fill(true);
+
+ // Init Render Targets
+ constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
+ constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
+ constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8;
+ u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
+ for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
+ set_block(rt_reg, registers_per_rt, rt_dirty_reg);
+ rt_dirty_reg++;
+ }
+ constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
+ dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
+ dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
+ dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
+ constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
+ constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
+ set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);
+
+ // Init Vertex Arrays
+ constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
+ constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
+ constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
+ u32 va_reg = DIRTY_REGS_POS(vertex_array);
+ u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
+ for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
+ vertex_reg += vertex_array_size) {
+ set_block(vertex_reg, 3, va_reg);
+ // The divisor concerns vertex array instances
+ dirty_pointers[vertex_reg + 3] = vi_reg;
+ va_reg++;
+ vi_reg++;
+ }
+ constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
+ constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
+ constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
+ va_reg = DIRTY_REGS_POS(vertex_array);
+ for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
+ vertex_reg += vertex_limit_size) {
+ set_block(vertex_reg, vertex_limit_size, va_reg);
+ va_reg++;
+ }
+ constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
+ constexpr u32 vertex_instance_size =
+ sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
+ constexpr u32 vertex_instance_end =
+ vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
+ vi_reg = DIRTY_REGS_POS(vertex_instance);
+ for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
+ vertex_reg += vertex_instance_size) {
+ set_block(vertex_reg, vertex_instance_size, vi_reg);
+ vi_reg++;
+ }
+ set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
+ DIRTY_REGS_POS(vertex_attrib_format));
+
+ // Init Shaders
+ constexpr u32 shader_registers_count =
+ sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
+ set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
+ DIRTY_REGS_POS(shaders));
+
+ // State
+
+ // Viewport
+ constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
+ constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
+ constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
+ set_block(viewport_start, viewport_size, viewport_dirty_reg);
+ constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
+ constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
+ set_block(view_volume_start, view_volume_size, viewport_dirty_reg);
+
+ // Viewport transformation
+ constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
+ constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
+ set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));
+
+ // Cullmode
+ constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
+ constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
+ set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));
+
+ // Screen y control
+ dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);
+
+ // Primitive Restart
+ constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
+ constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
+ set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));
+
+ // Depth Test
+ constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
+ dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
+
+ // Stencil Test
+ constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;
+
+ // Color Mask
+ constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
+ dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
+ set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
+ color_mask_dirty_reg);
+ // Blend State
+ constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
+ set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
+ blend_state_dirty_reg);
+ dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
+ set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
+ set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
+ blend_state_dirty_reg);
+
+ // Scissor State
+ constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
+ set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
+ scissor_test_dirty_reg);
+
+ // Polygon Offset
+ constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
+ dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
+}
+
+void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
// Reset the current macro.
executing_macro = 0;
// Lookup the macro offset
- const u32 entry{(method - MacroRegistersStart) >> 1};
- const auto& search{macro_offsets.find(entry)};
- if (search == macro_offsets.end()) {
- LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
- UNREACHABLE();
- return;
- }
+ const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();
// Execute the current macro.
- macro_interpreter.Execute(search->second, std::move(parameters));
+ macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+ if (mme_draw.current_mode != MMEDrawMode::Undefined) {
+ FlushMMEInlineDraw();
+ }
}
void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
@@ -108,6 +270,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
const u32 method = method_call.method;
+ if (method == cb_data_state.current) {
+ regs.reg_array[method] = method_call.argument;
+ ProcessCBData(method_call.argument);
+ return;
+ } else if (cb_data_state.current != null_cb_data) {
+ FinishCBData();
+ }
+
// It is an error to write to a register other than the current macro's ARG register before it
// has finished execution.
if (executing_macro != 0) {
@@ -129,7 +299,8 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
// Call the macro when there are no more parameters in the command buffer
if (method_call.IsLastCall()) {
- CallMacroMethod(executing_macro, std::move(macro_params));
+ CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+ macro_params.clear();
}
return;
}
@@ -143,49 +314,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
if (regs.reg_array[method] != method_call.argument) {
regs.reg_array[method] = method_call.argument;
- // Color buffers
- constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
- constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
- if (method >= first_rt_reg &&
- method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
- const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
- dirty_flags.color_buffer.set(rt_index);
- }
-
- // Zeta buffer
- constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
- if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
- method == MAXWELL3D_REG_INDEX(zeta_width) ||
- method == MAXWELL3D_REG_INDEX(zeta_height) ||
- (method >= MAXWELL3D_REG_INDEX(zeta) &&
- method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
- dirty_flags.zeta_buffer = true;
- }
-
- // Shader
- constexpr u32 shader_registers_count =
- sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
- if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
- method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
- dirty_flags.shaders = true;
- }
-
- // Vertex format
- if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
- method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
- dirty_flags.vertex_attrib_format = true;
- }
-
- // Vertex buffer
- if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
- method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
- dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
- } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
- method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
- dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
- } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
- method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
- dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
+ const std::size_t dirty_reg = dirty_pointers[method];
+ if (dirty_reg) {
+ dirty.regs[dirty_reg] = true;
+ if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
+ dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
+ dirty.vertex_array_buffers = true;
+ } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
+ dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
+ dirty.vertex_instances = true;
+ } else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
+ dirty_reg < DIRTY_REGS_POS(render_settings)) {
+ dirty.render_settings = true;
+ }
}
}
@@ -198,6 +339,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessMacroBind(method_call.argument);
break;
}
+ case MAXWELL3D_REG_INDEX(firmware[4]): {
+ ProcessFirmwareCall4();
+ break;
+ }
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
@@ -214,7 +359,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
- ProcessCBData(method_call.argument);
+ StartCBData(method);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
@@ -249,6 +394,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessQueryGet();
break;
}
+ case MAXWELL3D_REG_INDEX(condition.mode): {
+ ProcessQueryCondition();
+ break;
+ }
case MAXWELL3D_REG_INDEX(sync_info): {
ProcessSyncPoint();
break;
@@ -261,7 +410,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
- dirty_flags.OnMemoryWrite();
+ dirty.OnMemoryWrite();
}
break;
}
@@ -274,6 +423,97 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
}
}
+void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) {
+ if (mme_draw.current_mode == MMEDrawMode::Undefined) {
+ if (mme_draw.gl_begin_consume) {
+ mme_draw.current_mode = expected_mode;
+ mme_draw.current_count = count;
+ mme_draw.instance_count = 1;
+ mme_draw.gl_begin_consume = false;
+ mme_draw.gl_end_count = 0;
+ }
+ return;
+ } else {
+ if (mme_draw.current_mode == expected_mode && count == mme_draw.current_count &&
+ mme_draw.instance_mode && mme_draw.gl_begin_consume) {
+ mme_draw.instance_count++;
+ mme_draw.gl_begin_consume = false;
+ return;
+ } else {
+ FlushMMEInlineDraw();
+ }
+ }
+ // Tail call in case it needs to retry.
+ StepInstance(expected_mode, count);
+}
+
+void Maxwell3D::CallMethodFromMME(const GPU::MethodCall& method_call) {
+ const u32 method = method_call.method;
+ if (mme_inline[method]) {
+ regs.reg_array[method] = method_call.argument;
+ if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count) ||
+ method == MAXWELL3D_REG_INDEX(index_array.count)) {
+ const MMEDrawMode expected_mode = method == MAXWELL3D_REG_INDEX(vertex_buffer.count)
+ ? MMEDrawMode::Array
+ : MMEDrawMode::Indexed;
+ StepInstance(expected_mode, method_call.argument);
+ } else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) {
+ mme_draw.instance_mode =
+ (regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0);
+ mme_draw.gl_begin_consume = true;
+ } else {
+ mme_draw.gl_end_count++;
+ }
+ } else {
+ if (mme_draw.current_mode != MMEDrawMode::Undefined) {
+ FlushMMEInlineDraw();
+ }
+ CallMethod(method_call);
+ }
+}
+
+void Maxwell3D::FlushMMEInlineDraw() {
+ LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+ regs.vertex_buffer.count);
+ ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
+ ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
+
+ auto debug_context = system.GetGPUDebugContext();
+
+ if (debug_context) {
+ debug_context->OnEvent(Tegra::DebugContext::Event::IncomingPrimitiveBatch, nullptr);
+ }
+
+ // Both instance configuration registers cannot be set at the same time.
+ ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
+ "Illegal combination of instancing parameters");
+
+ const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
+ if (ShouldExecute()) {
+ rasterizer.DrawMultiBatch(is_indexed);
+ }
+
+ if (debug_context) {
+ debug_context->OnEvent(Tegra::DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+ }
+
+ // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
+ // the game is trying to draw indexed or direct mode. This needs to be verified on HW still -
+ // it's possible that it is incorrect and that there is some other register used to specify the
+ // drawing mode.
+ if (is_indexed) {
+ regs.index_array.count = 0;
+ } else {
+ regs.vertex_buffer.count = 0;
+ }
+ mme_draw.current_mode = MMEDrawMode::Undefined;
+ mme_draw.current_count = 0;
+ mme_draw.instance_count = 0;
+ mme_draw.instance_mode = false;
+ mme_draw.gl_begin_consume = false;
+ mme_draw.gl_end_count = 0;
+}
+
void Maxwell3D::ProcessMacroUpload(u32 data) {
ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
"upload_address exceeded macro_memory size!");
@@ -281,7 +521,15 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {
}
void Maxwell3D::ProcessMacroBind(u32 data) {
- macro_offsets[regs.macros.entry] = data;
+ macro_positions[regs.macros.entry++] = data;
+}
+
+void Maxwell3D::ProcessFirmwareCall4() {
+ LOG_WARNING(HW_GPU, "(STUBBED) called");
+
+ // Firmware call 4 is a blob that changes some registers depending on its parameters.
+ // These registers don't affect emulation and so are stubbed by setting 0xd00 to 1.
+ regs.reg_array[0xd00] = 1;
}
void Maxwell3D::ProcessQueryGet() {
@@ -302,6 +550,7 @@ void Maxwell3D::ProcessQueryGet() {
result = regs.query.query_sequence;
break;
default:
+ result = 1;
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
}
@@ -333,7 +582,6 @@ void Maxwell3D::ProcessQueryGet() {
query_result.timestamp = system.CoreTiming().GetTicks();
memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
}
- dirty_flags.OnMemoryWrite();
break;
}
default:
@@ -342,16 +590,56 @@ void Maxwell3D::ProcessQueryGet() {
}
}
+void Maxwell3D::ProcessQueryCondition() {
+ const GPUVAddr condition_address{regs.condition.Address()};
+ switch (regs.condition.mode) {
+ case Regs::ConditionMode::Always: {
+ execute_on = true;
+ break;
+ }
+ case Regs::ConditionMode::Never: {
+ execute_on = false;
+ break;
+ }
+ case Regs::ConditionMode::ResNonZero: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
+ break;
+ }
+ case Regs::ConditionMode::Equal: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on =
+ cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
+ break;
+ }
+ case Regs::ConditionMode::NotEqual: {
+ Regs::QueryCompare cmp;
+ memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+ execute_on =
+ cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
+ break;
+ }
+ default: {
+ UNIMPLEMENTED_MSG("Unimplemented Condition Mode!");
+ execute_on = true;
+ break;
+ }
+ }
+}
+
void Maxwell3D::ProcessSyncPoint() {
const u32 sync_point = regs.sync_info.sync_point.Value();
const u32 increment = regs.sync_info.increment.Value();
- const u32 cache_flush = regs.sync_info.unknown.Value();
- LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
- cache_flush);
+ [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
+ if (increment) {
+ system.GPU().IncrementSyncPoint(sync_point);
+ }
}
void Maxwell3D::DrawArrays() {
- LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+ LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
@@ -374,7 +662,9 @@ void Maxwell3D::DrawArrays() {
}
const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
- rasterizer.AccelerateDrawBatch(is_indexed);
+ if (ShouldExecute()) {
+ rasterizer.DrawBatch(is_indexed);
+ }
if (debug_context) {
debug_context->OnEvent(Tegra::DebugContext::Event::FinishedPrimitiveBatch, nullptr);
@@ -396,34 +686,48 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)];
- auto& buffer = shader.const_buffers[bind_data.index];
-
ASSERT(bind_data.index < Regs::MaxConstBuffers);
+ auto& buffer = shader.const_buffers[bind_data.index];
buffer.enabled = bind_data.valid.Value() != 0;
- buffer.index = bind_data.index;
buffer.address = regs.const_buffer.BufferAddress();
buffer.size = regs.const_buffer.cb_size;
}
void Maxwell3D::ProcessCBData(u32 value) {
+ const u32 id = cb_data_state.id;
+ cb_data_state.buffer[id][cb_data_state.counter] = value;
+ // Increment the current buffer position.
+ regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+ cb_data_state.counter++;
+}
+
+void Maxwell3D::StartCBData(u32 method) {
+ constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+ cb_data_state.start_pos = regs.const_buffer.cb_pos;
+ cb_data_state.id = method - first_cb_data;
+ cb_data_state.current = method;
+ cb_data_state.counter = 0;
+ ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
+}
+
+void Maxwell3D::FinishCBData() {
// Write the input value to the current const buffer at the current position.
const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
ASSERT(buffer_address != 0);
// Don't allow writing past the end of the buffer.
- ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
-
- const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
+ ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);
- u8* ptr{memory_manager.GetPointer(address)};
- rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
- memory_manager.Write<u32>(address, value);
+ const GPUVAddr address{buffer_address + cb_data_state.start_pos};
+ const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;
- dirty_flags.OnMemoryWrite();
+ const u32 id = cb_data_state.id;
+ memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
+ dirty.OnMemoryWrite();
- // Increment the current buffer position.
- regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+ cb_data_state.id = null_cb_data;
+ cb_data_state.current = null_cb_data;
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
@@ -432,14 +736,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
Texture::TICEntry tic_entry;
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
- ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
- tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
- "TIC versions other than BlockLinear or Pitch are unimplemented");
-
- const auto r_type = tic_entry.r_type.Value();
- const auto g_type = tic_entry.g_type.Value();
- const auto b_type = tic_entry.b_type.Value();
- const auto a_type = tic_entry.a_type.Value();
+ [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
+ [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
+ [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
+ [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};
// TODO(Subv): Different data types for separate components are not supported
DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
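Note: the chain of per-range checks in CallMethod is replaced by a table lookup. InitDirtySettings fills dirty_pointers so that each register index maps to the offsetof position (DIRTY_REGS_POS) of its flag inside DirtyRegs, and a register write then reduces to the lines already shown above:

    const std::size_t dirty_reg = dirty_pointers[method];
    if (dirty_reg) {                  // zero means no flag tracks this register
        dirty.regs[dirty_reg] = true; // DirtyRegs::regs aliases the named bool flags
    }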
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f342c78e6..4c97759ed 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -15,6 +15,7 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/math_util.h"
+#include "video_core/engines/const_buffer_info.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/gpu.h"
#include "video_core/macro_interpreter.h"
@@ -61,11 +62,13 @@ public:
static constexpr std::size_t NumVertexAttributes = 32;
static constexpr std::size_t NumVaryings = 31;
static constexpr std::size_t NumTextureSamplers = 32;
+ static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number
static constexpr std::size_t NumClipDistances = 8;
static constexpr std::size_t MaxShaderProgram = 6;
static constexpr std::size_t MaxShaderStage = 5;
// Maximum number of const buffers per shader stage.
static constexpr std::size_t MaxConstBuffers = 18;
+ static constexpr std::size_t MaxConstBufferSize = 0x10000;
enum class QueryMode : u32 {
Write = 0,
@@ -88,6 +91,20 @@ public:
enum class QuerySelect : u32 {
Zero = 0,
+ TimeElapsed = 2,
+ TransformFeedbackPrimitivesGenerated = 11,
+ PrimitivesGenerated = 18,
+ SamplesPassed = 21,
+ TransformFeedbackUnknown = 26,
+ };
+
+ struct QueryCompare {
+ u32 initial_sequence;
+ u32 initial_mode;
+ u32 unknown1;
+ u32 unknown2;
+ u32 current_sequence;
+ u32 current_mode;
};
enum class QuerySyncCondition : u32 {
@@ -95,6 +112,14 @@ public:
GreaterThan = 1,
};
+ enum class ConditionMode : u32 {
+ Never = 0,
+ Always = 1,
+ ResNonZero = 2,
+ Equal = 3,
+ NotEqual = 4,
+ };
+
enum class ShaderProgram : u32 {
VertexA = 0,
VertexB = 1,
@@ -786,8 +811,9 @@ public:
INSERT_PADDING_WORDS(0x21);
u32 vb_element_base;
+ u32 vb_base_instance;
- INSERT_PADDING_WORDS(0x36);
+ INSERT_PADDING_WORDS(0x35);
union {
BitField<0, 1, u32> c0;
@@ -813,7 +839,18 @@ public:
BitField<4, 1, u32> alpha_to_one;
} multisample_control;
- INSERT_PADDING_WORDS(0x7);
+ INSERT_PADDING_WORDS(0x4);
+
+ struct {
+ u32 address_high;
+ u32 address_low;
+ ConditionMode mode;
+
+ GPUVAddr Address() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } condition;
struct {
u32 tsc_address_high;
@@ -1053,7 +1090,9 @@ public:
INSERT_PADDING_WORDS(14);
} shader_config[MaxShaderProgram];
- INSERT_PADDING_WORDS(0x80);
+ INSERT_PADDING_WORDS(0x60);
+
+ u32 firmware[0x20];
struct {
u32 cb_size;
@@ -1112,13 +1151,6 @@ public:
static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
struct State {
- struct ConstBufferInfo {
- GPUVAddr address;
- u32 index;
- u32 size;
- bool enabled;
- };
-
struct ShaderStageInfo {
std::array<ConstBufferInfo, Regs::MaxConstBuffers> const_buffers;
};
@@ -1129,23 +1161,77 @@ public:
State state{};
- struct DirtyFlags {
- std::bitset<8> color_buffer{0xFF};
- std::bitset<32> vertex_array{0xFFFFFFFF};
+ struct DirtyRegs {
+ static constexpr std::size_t NUM_REGS = 256;
+ union {
+ struct {
+ bool null_dirty;
+
+ // Vertex Attributes
+ bool vertex_attrib_format;
+
+ // Vertex Arrays
+ std::array<bool, 32> vertex_array;
+
+ bool vertex_array_buffers;
+
+ // Vertex Instances
+ std::array<bool, 32> vertex_instance;
+
+ bool vertex_instances;
+
+ // Render Targets
+ std::array<bool, 8> render_target;
+ bool depth_buffer;
+
+ bool render_settings;
- bool vertex_attrib_format = true;
- bool zeta_buffer = true;
- bool shaders = true;
+ // Shaders
+ bool shaders;
+
+ // Rasterizer State
+ bool viewport;
+ bool clip_coefficient;
+ bool cull_mode;
+ bool primitive_restart;
+ bool depth_test;
+ bool stencil_test;
+ bool blend_state;
+ bool scissor_test;
+ bool transform_feedback;
+ bool color_mask;
+ bool polygon_offset;
+
+ // Complementary
+ bool viewport_transform;
+ bool screen_y_control;
+
+ bool memory_general;
+ };
+ std::array<bool, NUM_REGS> regs;
+ };
+
+ void ResetVertexArrays() {
+ vertex_array.fill(true);
+ vertex_array_buffers = true;
+ }
+
+ void ResetRenderTargets() {
+ depth_buffer = true;
+ render_target.fill(true);
+ render_settings = true;
+ }
void OnMemoryWrite() {
- zeta_buffer = true;
shaders = true;
- color_buffer.set();
- vertex_array.set();
+ memory_general = true;
+ ResetRenderTargets();
+ ResetVertexArrays();
}
- };
- DirtyFlags dirty_flags;
+ } dirty{};
+
+ std::array<u8, Regs::NUM_REGS> dirty_pointers{};
/// Reads a register value located at the input method address
u32 GetRegisterValue(u32 method) const;
@@ -1153,6 +1239,11 @@ public:
/// Write the value to the register identified by method.
void CallMethod(const GPU::MethodCall& method_call);
+ /// Write the value to the register identified by method.
+ void CallMethodFromMME(const GPU::MethodCall& method_call);
+
+ void FlushMMEInlineDraw();
+
/// Given a Texture Handle, returns the TSC and TIC entries.
Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle,
std::size_t offset) const;
@@ -1174,6 +1265,25 @@ public:
return macro_memory;
}
+ bool ShouldExecute() const {
+ return execute_on;
+ }
+
+ enum class MMEDrawMode : u32 {
+ Undefined,
+ Array,
+ Indexed,
+ };
+
+ struct MMEDrawState {
+ MMEDrawMode current_mode{MMEDrawMode::Undefined};
+ u32 current_count{};
+ u32 instance_count{};
+ bool instance_mode{};
+ bool gl_begin_consume{};
+ u32 gl_end_count{};
+ } mme_draw;
+
private:
void InitializeRegisterDefaults();
@@ -1184,7 +1294,9 @@ private:
MemoryManager& memory_manager;
/// Start offsets of each macro in macro_memory
- std::unordered_map<u32, u32> macro_offsets;
+ std::array<u32, 0x80> macro_positions = {};
+
+ std::array<bool, Regs::NUM_REGS> mme_inline{};
/// Memory for macro code
MacroMemory macro_memory;
@@ -1197,20 +1309,34 @@ private:
/// Interpreter for the macro codes uploaded to the GPU.
MacroInterpreter macro_interpreter;
+ static constexpr u32 null_cb_data = 0xFFFFFFFF;
+ struct {
+ std::array<std::array<u32, 0x4000>, 16> buffer;
+ u32 current{null_cb_data};
+ u32 id{null_cb_data};
+ u32 start_pos{};
+ u32 counter{};
+ } cb_data_state;
+
Upload::State upload_state;
+ bool execute_on{true};
+
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
/// Retrieves information about a specific TSC entry from the TSC buffer.
Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
+ void InitDirtySettings();
+
/**
* Call a macro on this engine.
* @param method Method to call
+ * @param num_parameters Number of arguments
* @param parameters Arguments to the method call
*/
- void CallMacroMethod(u32 method, std::vector<u32> parameters);
+ void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
/// Handles writes to the macro uploading register.
void ProcessMacroUpload(u32 data);
@@ -1218,23 +1344,34 @@ private:
/// Handles writes to the macro bind register.
void ProcessMacroBind(u32 data);
+ /// Handles firmware blob 4
+ void ProcessFirmwareCall4();
+
/// Handles a write to the CLEAR_BUFFERS register.
void ProcessClearBuffers();
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
+ /// Handles conditional rendering
+ void ProcessQueryCondition();
+
/// Handles writes to syncing register.
void ProcessSyncPoint();
/// Handles a write to the CB_DATA[i] register.
+ void StartCBData(u32 method);
void ProcessCBData(u32 value);
+ void FinishCBData();
/// Handles a write to the CB_BIND register.
void ProcessCBBind(Regs::ShaderStage stage);
/// Handles a write to the VERTEX_END_GL register, triggering a draw.
void DrawArrays();
+
+ /// Handles an instance drawcall from MME
+ void StepInstance(MMEDrawMode expected_mode, u32 count);
};
#define ASSERT_REG_POSITION(field_name, position) \
@@ -1291,10 +1428,12 @@ ASSERT_REG_POSITION(stencil_front_mask, 0x4E7);
ASSERT_REG_POSITION(frag_color_clamp, 0x4EA);
ASSERT_REG_POSITION(screen_y_control, 0x4EB);
ASSERT_REG_POSITION(vb_element_base, 0x50D);
+ASSERT_REG_POSITION(vb_base_instance, 0x50E);
ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
ASSERT_REG_POSITION(point_size, 0x546);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
+ASSERT_REG_POSITION(condition, 0x554);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
ASSERT_REG_POSITION(tic, 0x55D);
@@ -1324,6 +1463,7 @@ ASSERT_REG_POSITION(vertex_array[0], 0x700);
ASSERT_REG_POSITION(independent_blend, 0x780);
ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0);
ASSERT_REG_POSITION(shader_config[0], 0x800);
+ASSERT_REG_POSITION(firmware, 0x8C0);
ASSERT_REG_POSITION(const_buffer, 0x8E0);
ASSERT_REG_POSITION(cb_bind[0], 0x904);
ASSERT_REG_POSITION(tex_cb_index, 0x982);
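Note on DirtyRegs: the named bool flags and the regs array share one union, so dirty.regs[offset] and the flag located at that offsetof position are the same byte; that aliasing is what lets InitDirtySettings address flags through DIRTY_REGS_POS. A guard one could add to keep the two views in sync (a suggestion, not part of the patch):

    static_assert(sizeof(Maxwell3D::DirtyRegs) == Maxwell3D::DirtyRegs::NUM_REGS,
                  "the named dirty flags must stay within the aliased regs array");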
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 3a5dfef0c..ad8453c5f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
+#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines {
-MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
- MemoryManager& memory_manager)
- : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
+ : system{system}, memory_manager{memory_manager} {}
void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -38,7 +37,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
}
void MaxwellDMA::HandleCopy() {
- LOG_WARNING(HW_GPU, "Requested a DMA copy");
+ LOG_TRACE(HW_GPU, "Requested a DMA copy");
const GPUVAddr source = regs.src_address.Address();
const GPUVAddr dest = regs.dst_address.Address();
@@ -58,7 +57,7 @@ void MaxwellDMA::HandleCopy() {
}
// All copies here update the main memory, so mark all rasterizer states as invalid.
- system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+ system.GPU().Maxwell3D().dirty.OnMemoryWrite();
if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
// When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
ASSERT(regs.exec.enable_2d == 1);
if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
- ASSERT(regs.src_params.size_z == 1);
+ ASSERT(regs.src_params.BlockDepth() == 0);
// If the input is tiled and the output is linear, deswizzle the input and copy it over.
- const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+ const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
const std::size_t src_size = Texture::CalculateSize(
- true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+ true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
+ const std::size_t src_layer_size = Texture::CalculateSize(
+ true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
+ regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
+
const std::size_t dst_size = regs.dst_pitch * regs.y_count;
if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
memory_manager.ReadBlock(source, read_buffer.data(), src_size);
memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
- Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
- regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
- write_buffer.data(), regs.src_params.BlockHeight(),
- regs.src_params.pos_x, regs.src_params.pos_y);
+ Texture::UnswizzleSubrect(
+ regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
+ read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
+ regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);
memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
} else {
- ASSERT(regs.dst_params.BlockDepth() == 1);
+ ASSERT(regs.dst_params.BlockDepth() == 0);
- const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
+ const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;
const std::size_t dst_size = Texture::CalculateSize(
- true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+ true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
const std::size_t dst_layer_size = Texture::CalculateSize(
- true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+ true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
write_buffer.resize(dst_size);
}
- memory_manager.ReadBlock(source, read_buffer.data(), src_size);
- memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+ if (Settings::values.use_accurate_gpu_emulation) {
+ memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+ memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+ } else {
+ memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
+ memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
+ }
// If the input is linear and the output is tiled, swizzle the input and copy it over.
- Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
- src_bytes_per_pixel,
- write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
- read_buffer.data(), regs.dst_params.BlockHeight());
+ Texture::SwizzleSubrect(
+ regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
+ write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
+ regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);
memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
}
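Note: the linear-to-tiled path now picks its read primitive based on the accuracy setting; as I read it, ReadBlock synchronizes with the rasterizer caches while ReadBlockUnsafe skips that work for speed. The pattern, with that interpretation as comments:

    if (Settings::values.use_accurate_gpu_emulation) {
        memory_manager.ReadBlock(source, read_buffer.data(), src_size);       // cache-synchronized read
    } else {
        memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); // skips the synchronization
    }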
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index e5942f671..93808a9bb 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
class MemoryManager;
}
-namespace VideoCore {
-class RasterizerInterface;
-}
-
namespace Tegra::Engines {
/**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {
class MaxwellDMA final {
public:
- explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
- MemoryManager& memory_manager);
+ explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
~MaxwellDMA() = default;
/// Write the value to the register identified by method.
@@ -59,11 +54,11 @@ public:
};
u32 BlockHeight() const {
- return 1 << block_height;
+ return block_height.Value();
}
u32 BlockDepth() const {
- return 1 << block_depth;
+ return block_depth.Value();
}
};
@@ -180,8 +175,6 @@ public:
private:
Core::System& system;
- VideoCore::RasterizerInterface& rasterizer;
-
MemoryManager& memory_manager;
std::vector<u8> read_buffer;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index ffb3ec3e0..7a6355ce2 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -4,6 +4,7 @@
#pragma once
+#include <array>
#include <bitset>
#include <optional>
#include <tuple>
@@ -77,7 +78,7 @@ union Attribute {
constexpr explicit Attribute(u64 value) : value(value) {}
enum class Index : u64 {
- PointSize = 6,
+ LayerViewportPointSize = 6,
Position = 7,
Attribute_0 = 8,
Attribute_31 = 39,
@@ -126,6 +127,15 @@ union Sampler {
u64 value{};
};
+union Image {
+ Image() = default;
+
+ constexpr explicit Image(u64 value) : value{value} {}
+
+ BitField<36, 13, u64> index;
+ u64 value;
+};
+
} // namespace Tegra::Shader
namespace std {
@@ -344,6 +354,26 @@ enum class TextureMiscMode : u64 {
PTP,
};
+enum class SurfaceDataMode : u64 {
+ P = 0,
+ D_BA = 1,
+};
+
+enum class OutOfBoundsStore : u64 {
+ Ignore = 0,
+ Clamp = 1,
+ Trap = 2,
+};
+
+enum class ImageType : u64 {
+ Texture1D = 0,
+ TextureBuffer = 1,
+ Texture1DArray = 2,
+ Texture2D = 3,
+ Texture2DArray = 4,
+ Texture3D = 5,
+};
+
enum class IsberdMode : u64 {
None = 0,
Patch = 1,
@@ -398,7 +428,7 @@ enum class LmemLoadCacheManagement : u64 {
CV = 3,
};
-enum class LmemStoreCacheManagement : u64 {
+enum class StoreCacheManagement : u64 {
Default = 0,
CG = 1,
CS = 2,
@@ -508,6 +538,41 @@ enum class PhysicalAttributeDirection : u64 {
Output = 1,
};
+enum class VoteOperation : u64 {
+ All = 0, // allThreadsNV
+ Any = 1, // anyThreadNV
+ Eq = 2, // allThreadsEqualNV
+};
+
+enum class ImageAtomicOperationType : u64 {
+ U32 = 0,
+ S32 = 1,
+ U64 = 2,
+ F32 = 3,
+ S64 = 5,
+ SD32 = 6,
+ SD64 = 7,
+};
+
+enum class ImageAtomicOperation : u64 {
+ Add = 0,
+ Min = 1,
+ Max = 2,
+ Inc = 3,
+ Dec = 4,
+ And = 5,
+ Or = 6,
+ Xor = 7,
+ Exch = 8,
+};
+
+enum class ShuffleOperation : u64 {
+ Idx = 0, // shuffleNV
+ Up = 1, // shuffleUpNV
+ Down = 2, // shuffleDownNV
+ Bfly = 3, // shuffleXorNV
+};
+
union Instruction {
Instruction& operator=(const Instruction& instr) {
value = instr.value;
@@ -530,6 +595,27 @@ union Instruction {
BitField<48, 16, u64> opcode;
union {
+ BitField<8, 5, ConditionCode> cc;
+ BitField<13, 1, u64> trigger;
+ } nop;
+
+ union {
+ BitField<48, 2, VoteOperation> operation;
+ BitField<45, 3, u64> dest_pred;
+ BitField<39, 3, u64> value;
+ BitField<42, 1, u64> negate_value;
+ } vote;
+
+ union {
+ BitField<30, 2, ShuffleOperation> operation;
+ BitField<48, 3, u64> pred48;
+ BitField<28, 1, u64> is_index_imm;
+ BitField<29, 1, u64> is_mask_imm;
+ BitField<20, 5, u64> index_imm;
+ BitField<34, 13, u64> mask_imm;
+ } shfl;
+
+ union {
BitField<8, 8, Register> gpr;
BitField<20, 24, s64> offset;
} gmem;
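The new vote and shfl unions expose the warp-level instruction fields; the enum comments above already give the NV GLSL equivalents (allThreadsNV, shuffleNV, and so on). Below is a throwaway decode of the shfl fields using plain shifts, only to make the bit positions concrete; the helper names are mine, and reading pred48 as the in-bounds result predicate is an assumption:

#include <cstdint>

// Shift-and-mask extraction mirroring the shfl BitField positions above.
constexpr std::uint64_t Bits(std::uint64_t insn, unsigned pos, unsigned count) {
    return (insn >> pos) & ((std::uint64_t{1} << count) - 1);
}

struct ShflFields {
    std::uint64_t operation;  // bits 30..31: Idx / Up / Down / Bfly
    bool index_is_immediate;  // bit 28
    bool mask_is_immediate;   // bit 29
    std::uint64_t index_imm;  // bits 20..24, used when index_is_immediate
    std::uint64_t mask_imm;   // bits 34..46, used when mask_is_immediate
    std::uint64_t pred48;     // bits 48..50, assumed to be the in-bounds result predicate
};

constexpr ShflFields DecodeShfl(std::uint64_t insn) {
    return ShflFields{Bits(insn, 30, 2),
                      Bits(insn, 28, 1) != 0,
                      Bits(insn, 29, 1) != 0,
                      Bits(insn, 20, 5),
                      Bits(insn, 34, 13),
                      Bits(insn, 48, 3)};
}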
@@ -627,6 +713,10 @@ union Instruction {
} shift;
union {
+ BitField<39, 1, u64> wrap;
+ } shr;
+
+ union {
BitField<39, 5, u64> shift_amount;
BitField<48, 1, u64> negate_b;
BitField<49, 1, u64> negate_a;
@@ -811,7 +901,7 @@ union Instruction {
} ld_l;
union {
- BitField<44, 2, LmemStoreCacheManagement> cache_management;
+ BitField<44, 2, StoreCacheManagement> cache_management;
} st_l;
union {
@@ -838,6 +928,7 @@ union Instruction {
union {
BitField<0, 3, u64> pred0;
BitField<3, 3, u64> pred3;
+ BitField<6, 1, u64> neg_b;
BitField<7, 1, u64> abs_a;
BitField<39, 3, u64> pred39;
BitField<42, 1, u64> neg_pred;
@@ -859,6 +950,11 @@ union Instruction {
} isetp;
union {
+ BitField<48, 1, u64> is_signed;
+ BitField<49, 3, PredCondition> cond;
+ } icmp;
+
+ union {
BitField<0, 3, u64> pred0;
BitField<3, 3, u64> pred3;
BitField<12, 3, u64> pred12;
@@ -901,8 +997,6 @@ union Instruction {
} csetp;
union {
- BitField<35, 4, PredCondition> cond;
- BitField<49, 1, u64> h_and;
BitField<6, 1, u64> ftz;
BitField<45, 2, PredOperation> op;
BitField<3, 3, u64> pred3;
@@ -910,9 +1004,21 @@ union Instruction {
BitField<43, 1, u64> negate_a;
BitField<44, 1, u64> abs_a;
BitField<47, 2, HalfType> type_a;
- BitField<31, 1, u64> negate_b;
- BitField<30, 1, u64> abs_b;
- BitField<28, 2, HalfType> type_b;
+ union {
+ BitField<35, 4, PredCondition> cond;
+ BitField<49, 1, u64> h_and;
+ BitField<31, 1, u64> negate_b;
+ BitField<30, 1, u64> abs_b;
+ BitField<28, 2, HalfType> type_b;
+ } reg;
+ union {
+ BitField<56, 1, u64> negate_b;
+ BitField<54, 1, u64> abs_b;
+ } cbuf;
+ union {
+ BitField<49, 4, PredCondition> cond;
+ BitField<53, 1, u64> h_and;
+ } cbuf_and_imm;
BitField<42, 1, u64> neg_pred;
BitField<39, 3, u64> pred39;
} hsetp2;
@@ -961,7 +1067,6 @@ union Instruction {
} iset;
union {
- BitField<41, 2, u64> selector; // i2i and i2f only
BitField<45, 1, u64> negate_a;
BitField<49, 1, u64> abs_a;
BitField<10, 2, Register::Size> src_size;
@@ -978,8 +1083,6 @@ union Instruction {
} f2i;
union {
- BitField<8, 2, Register::Size> src_size;
- BitField<10, 2, Register::Size> dst_size;
BitField<39, 4, u64> rounding;
// H0, H1 extract for F16 missing
BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
@@ -989,6 +1092,13 @@ union Instruction {
}
} f2f;
+ union {
+ BitField<41, 2, u64> selector;
+ } int_src;
+
+ union {
+ BitField<41, 1, u64> selector;
+ } float_src;
} conversion;
union {
@@ -1232,8 +1342,23 @@ union Instruction {
} texs;
union {
+ BitField<28, 1, u64> is_array;
+ BitField<29, 2, TextureType> texture_type;
+ BitField<35, 1, u64> aoffi;
+ BitField<49, 1, u64> nodep_flag;
+ BitField<50, 1, u64> ms; // Multisample?
+ BitField<54, 1, u64> cl;
+ BitField<55, 1, u64> process_mode;
+
+ TextureProcessMode GetTextureProcessMode() const {
+ return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL;
+ }
+ } tld;
+
+ union {
BitField<49, 1, u64> nodep_flag;
BitField<53, 4, u64> texture_info;
+ BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
// The TLDS instruction has a weird encoding for the texture type.
@@ -1281,6 +1406,43 @@ union Instruction {
} tlds;
union {
+ BitField<24, 2, StoreCacheManagement> cache_management;
+ BitField<33, 3, ImageType> image_type;
+ BitField<49, 2, OutOfBoundsStore> out_of_bounds_store;
+ BitField<51, 1, u64> is_immediate;
+ BitField<52, 1, SurfaceDataMode> mode;
+
+ BitField<20, 3, StoreType> store_data_layout;
+ BitField<20, 4, u64> component_mask_selector;
+
+ bool IsComponentEnabled(std::size_t component) const {
+ ASSERT(mode == SurfaceDataMode::P);
+ constexpr u8 R = 0b0001;
+ constexpr u8 G = 0b0010;
+ constexpr u8 B = 0b0100;
+ constexpr u8 A = 0b1000;
+ constexpr std::array<u8, 16> mask = {
+ 0, (R), (G), (R | G), (B), (R | B),
+ (G | B), (R | G | B), (A), (R | A), (G | A), (R | G | A),
+ (B | A), (R | B | A), (G | B | A), (R | G | B | A)};
+ return std::bitset<4>{mask.at(component_mask_selector)}.test(component);
+ }
+
+ StoreType GetStoreDataLayout() const {
+ ASSERT(mode == SurfaceDataMode::D_BA);
+ return store_data_layout;
+ }
+ } suldst;
+
+ union {
+ BitField<28, 1, u64> is_ba;
+ BitField<51, 3, ImageAtomicOperationType> operation_type;
+ BitField<33, 3, ImageType> image_type;
+ BitField<29, 4, ImageAtomicOperation> operation;
+ BitField<49, 2, OutOfBoundsStore> out_of_bounds_store;
+ } suatom_d;
+
+ union {
BitField<20, 24, u64> target;
BitField<5, 1, u64> constant_buffer;
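IsComponentEnabled above turns the 4-bit component_mask_selector into a per-component write mask for the surface instructions in P mode (hence the assert on mode). A standalone rendition of the same lookup, handy for checking individual selector values; note that with the table as written, entry i equals i, so the selector bits map directly onto R(0), G(1), B(2), A(3). The free-function name is mine:

#include <array>
#include <bitset>
#include <cstddef>
#include <cstdint>

// Same table as in the suldst union, spelled out as plain binary literals.
bool ComponentEnabled(std::uint32_t selector, std::size_t component) {
    constexpr std::array<std::uint8_t, 16> mask = {
        0b0000, 0b0001, 0b0010, 0b0011, 0b0100, 0b0101, 0b0110, 0b0111,
        0b1000, 0b1001, 0b1010, 0b1011, 0b1100, 0b1101, 0b1110, 0b1111};
    return std::bitset<4>{mask.at(selector)}.test(component);
}
// Example: selector 0b0110 stores only G and B:
//   ComponentEnabled(0b0110, 1) == true, ComponentEnabled(0b0110, 0) == false.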
@@ -1295,6 +1457,20 @@ union Instruction {
} bra;
union {
+ BitField<20, 24, u64> target;
+ BitField<5, 1, u64> constant_buffer;
+
+ s32 GetBranchExtend() const {
+ // Sign extend the branch target offset
+ u32 mask = 1U << (24 - 1);
+ u32 value = static_cast<u32>(target);
+ // The branch offset is relative to the next instruction and is stored in bytes, so
+ // divide it by the size of an instruction and add 1 to it.
+ return static_cast<s32>((value ^ mask) - mask) / sizeof(Instruction) + 1;
+ }
+ } brx;
+
+ union {
BitField<39, 1, u64> emit; // EmitVertex
BitField<40, 1, u64> cut; // EndPrimitive
} out;
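GetBranchExtend for BRX mirrors the existing BRA helper: sign-extend the 24-bit byte offset, convert it to an instruction count (sizeof(Instruction) is 8), and add 1 because the offset is relative to the next instruction. A standalone copy of the arithmetic with two checked values; the function name is mine:

#include <cstdint>

// The (value ^ mask) - mask trick sign-extends a 24-bit two's-complement field:
// XOR with the sign-bit mask flips bit 23, and subtracting the mask restores the
// value with the upper bits filled in.
constexpr std::int32_t BranchExtend24(std::uint32_t target) {
    constexpr std::uint32_t mask = 1U << (24 - 1);
    return static_cast<std::int32_t>((target ^ mask) - mask) / 8 + 1;
}

static_assert(BranchExtend24(0x000010) == 3);   // +16 bytes: 2 instructions ahead, plus 1
static_assert(BranchExtend24(0xFFFFF0) == -1);  // -16 bytes in 24-bit two's complement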
@@ -1371,6 +1547,7 @@ union Instruction {
Attribute attribute;
Sampler sampler;
+ Image image;
u64 value;
};
@@ -1385,11 +1562,14 @@ public:
SYNC,
BRK,
DEPBAR,
+ VOTE,
+ SHFL,
BFE_C,
BFE_R,
BFE_IMM,
BFI_IMM_R,
BRA,
+ BRX,
PBK,
LD_A,
LD_L,
@@ -1408,12 +1588,17 @@ public:
TXQ, // Texture Query
TXQ_B, // Texture Query Bindless
TEXS, // Texture Fetch with scalar/non-vec4 source/destinations
+ TLD, // Texture Load
TLDS, // Texture Load with scalar/non-vec4 source/destinations
TLD4, // Texture Load 4
TLD4S, // Texture Load 4 with scalar / non - vec4 source / destinations
TMML_B, // Texture Mip Map Level
TMML, // Texture Mip Map Level
+ SUST, // Surface Store
+ SULD, // Surface Load
+ SUATOM, // Surface Atomic Operation
EXIT,
+ NOP,
IPA,
OUT_R, // Emit vertex/primitive
ISBERD,
@@ -1456,7 +1641,9 @@ public:
HFMA2_RC,
HFMA2_RR,
HFMA2_IMM_R,
+ HSETP2_C,
HSETP2_R,
+ HSETP2_IMM,
HSET2_R,
POPC_C,
POPC_R,
@@ -1464,6 +1651,10 @@ public:
SEL_C,
SEL_R,
SEL_IMM,
+ ICMP_RC,
+ ICMP_R,
+ ICMP_CR,
+ ICMP_IMM,
MUFU, // Multi-Function Operator
RRO_C, // Range Reduction Operator
RRO_R,
@@ -1541,8 +1732,10 @@ public:
Hfma2,
Flow,
Synch,
+ Warp,
Memory,
Texture,
+ Image,
FloatSet,
FloatSetPredicate,
IntegerSet,
@@ -1661,10 +1854,13 @@ private:
INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
INST("111000101010----", Id::PBK, Type::Flow, "PBK"),
INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
+ INST("111000100101----", Id::BRX, Type::Flow, "BRX"),
INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+ INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
+ INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"),
INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
@@ -1682,11 +1878,16 @@ private:
INST("1101111101001---", Id::TXQ, Type::Texture, "TXQ"),
INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"),
INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"),
- INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"),
+ INST("11011100--11----", Id::TLD, Type::Texture, "TLD"),
+ INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"),
INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
+ INST("11101011001-----", Id::SUST, Type::Image, "SUST"),
+ INST("11101011000-----", Id::SULD, Type::Image, "SULD"),
+ INST("1110101000------", Id::SUATOM, Type::Image, "SUATOM_D"),
+ INST("0101000010110---", Id::NOP, Type::Trivial, "NOP"),
INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
@@ -1720,6 +1921,10 @@ private:
INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
+ INST("010100110100----", Id::ICMP_RC, Type::ArithmeticInteger, "ICMP_RC"),
+ INST("010110110100----", Id::ICMP_R, Type::ArithmeticInteger, "ICMP_R"),
+ INST("010010110100----", Id::ICMP_CR, Type::ArithmeticInteger, "ICMP_CR"),
+ INST("0011011-0100----", Id::ICMP_IMM, Type::ArithmeticInteger, "ICMP_IMM"),
INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"),
INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"),
INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"),
@@ -1735,7 +1940,9 @@ private:
INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
- INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
+ INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
+ INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
+ INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),