path: root/src/video_core

Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/CMakeLists.txt | 42
-rw-r--r--  src/video_core/clipper.cpp | 24
-rw-r--r--  src/video_core/command_processor.cpp | 43
-rw-r--r--  src/video_core/debug_utils/debug_utils.cpp | 256
-rw-r--r--  src/video_core/debug_utils/debug_utils.h | 25
-rw-r--r--  src/video_core/pica.cpp | 2
-rw-r--r--  src/video_core/pica.h | 62
-rw-r--r--  src/video_core/pica_state.h | 4
-rw-r--r--  src/video_core/primitive_assembly.cpp | 2
-rw-r--r--  src/video_core/primitive_assembly.h | 5
-rw-r--r--  src/video_core/rasterizer.cpp | 12
-rw-r--r--  src/video_core/rasterizer.h | 40
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 7
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 4
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer_cache.cpp | 28
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer_cache.h | 7
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_gen.cpp | 19
-rw-r--r--  src/video_core/shader/shader.cpp | 139
-rw-r--r--  src/video_core/shader/shader.h | 120
-rw-r--r--  src/video_core/shader/shader_interpreter.cpp | 51
-rw-r--r--  src/video_core/shader/shader_interpreter.h | 25
-rw-r--r--  src/video_core/shader/shader_jit_x64.cpp | 890
-rw-r--r--  src/video_core/shader/shader_jit_x64.h | 115
-rw-r--r--  src/video_core/shader/shader_jit_x64_compiler.cpp | 889
-rw-r--r--  src/video_core/shader/shader_jit_x64_compiler.h | 124
-rw-r--r--  src/video_core/texture/etc1.cpp | 124
-rw-r--r--  src/video_core/texture/etc1.h | 16
-rw-r--r--  src/video_core/texture/texture_decode.cpp | 229
-rw-r--r--  src/video_core/texture/texture_decode.h | 60
-rw-r--r--  src/video_core/vertex_loader.cpp | 5
-rw-r--r--  src/video_core/vertex_loader.h | 4
-rw-r--r--  src/video_core/video_core.cpp | 1
-rw-r--r--  src/video_core/video_core.h | 1
33 files changed, 1811 insertions(+), 1564 deletions(-)
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6ca319b59..ad984cd94 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,36 +1,30 @@
set(SRCS
- renderer_opengl/gl_rasterizer.cpp
- renderer_opengl/gl_rasterizer_cache.cpp
- renderer_opengl/gl_shader_gen.cpp
- renderer_opengl/gl_shader_util.cpp
- renderer_opengl/gl_state.cpp
- renderer_opengl/renderer_opengl.cpp
- debug_utils/debug_utils.cpp
clipper.cpp
command_processor.cpp
+ debug_utils/debug_utils.cpp
pica.cpp
primitive_assembly.cpp
rasterizer.cpp
renderer_base.cpp
+ renderer_opengl/gl_rasterizer.cpp
+ renderer_opengl/gl_rasterizer_cache.cpp
+ renderer_opengl/gl_shader_gen.cpp
+ renderer_opengl/gl_shader_util.cpp
+ renderer_opengl/gl_state.cpp
+ renderer_opengl/renderer_opengl.cpp
shader/shader.cpp
shader/shader_interpreter.cpp
swrasterizer.cpp
+ texture/etc1.cpp
+ texture/texture_decode.cpp
vertex_loader.cpp
video_core.cpp
)
set(HEADERS
- debug_utils/debug_utils.h
- renderer_opengl/gl_rasterizer.h
- renderer_opengl/gl_rasterizer_cache.h
- renderer_opengl/gl_resource_manager.h
- renderer_opengl/gl_shader_gen.h
- renderer_opengl/gl_shader_util.h
- renderer_opengl/gl_state.h
- renderer_opengl/pica_to_gl.h
- renderer_opengl/renderer_opengl.h
clipper.h
command_processor.h
+ debug_utils/debug_utils.h
gpu_debugger.h
pica.h
pica_state.h
@@ -39,10 +33,20 @@ set(HEADERS
rasterizer.h
rasterizer_interface.h
renderer_base.h
+ renderer_opengl/gl_rasterizer.h
+ renderer_opengl/gl_rasterizer_cache.h
+ renderer_opengl/gl_resource_manager.h
+ renderer_opengl/gl_shader_gen.h
+ renderer_opengl/gl_shader_util.h
+ renderer_opengl/gl_state.h
+ renderer_opengl/pica_to_gl.h
+ renderer_opengl/renderer_opengl.h
shader/debug_data.h
shader/shader.h
shader/shader_interpreter.h
swrasterizer.h
+ texture/etc1.h
+ texture/texture_decode.h
utils.h
vertex_loader.h
video_core.h
@@ -50,10 +54,12 @@ set(HEADERS
if(ARCHITECTURE_x86_64)
set(SRCS ${SRCS}
- shader/shader_jit_x64.cpp)
+ shader/shader_jit_x64.cpp
+ shader/shader_jit_x64_compiler.cpp)
set(HEADERS ${HEADERS}
- shader/shader_jit_x64.h)
+ shader/shader_jit_x64.h
+ shader/shader_jit_x64_compiler.h)
endif()
create_directory_groups(${SRCS} ${HEADERS})
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 05b5cea73..0774ffc53 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -18,6 +18,8 @@
#include "video_core/rasterizer.h"
#include "video_core/shader/shader.h"
+using Pica::Rasterizer::Vertex;
+
namespace Pica {
namespace Clipper {
@@ -29,20 +31,20 @@ public:
float24::FromFloat32(0), float24::FromFloat32(0)))
: coeffs(coeffs), bias(bias) {}
- bool IsInside(const OutputVertex& vertex) const {
+ bool IsInside(const Vertex& vertex) const {
return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
}
- bool IsOutSide(const OutputVertex& vertex) const {
+ bool IsOutSide(const Vertex& vertex) const {
return !IsInside(vertex);
}
- OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
+ Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const {
float24 dp = Math::Dot(v0.pos + bias, coeffs);
float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
float24 factor = dp_prev / (dp_prev - dp);
- return OutputVertex::Lerp(factor, v0, v1);
+ return Vertex::Lerp(factor, v0, v1);
}
private:
@@ -51,7 +53,7 @@ private:
Math::Vec4<float24> bias;
};
-static void InitScreenCoordinates(OutputVertex& vtx) {
+static void InitScreenCoordinates(Vertex& vtx) {
struct {
float24 halfsize_x;
float24 offset_x;
@@ -91,8 +93,8 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
// introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
// fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
static const size_t MAX_VERTICES = 9;
- static_vector<OutputVertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
- static_vector<OutputVertex, MAX_VERTICES> buffer_b;
+ static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
+ static_vector<Vertex, MAX_VERTICES> buffer_b;
auto* output_list = &buffer_a;
auto* input_list = &buffer_b;
@@ -123,7 +125,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
std::swap(input_list, output_list);
output_list->clear();
- const OutputVertex* reference_vertex = &input_list->back();
+ const Vertex* reference_vertex = &input_list->back();
for (const auto& vertex : *input_list) {
// NOTE: This algorithm changes vertex order in some cases!
@@ -148,9 +150,9 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
InitScreenCoordinates((*output_list)[1]);
for (size_t i = 0; i < output_list->size() - 2; i++) {
- OutputVertex& vtx0 = (*output_list)[0];
- OutputVertex& vtx1 = (*output_list)[i + 1];
- OutputVertex& vtx2 = (*output_list)[i + 2];
+ Vertex& vtx0 = (*output_list)[0];
+ Vertex& vtx1 = (*output_list)[i + 1];
+ Vertex& vtx2 = (*output_list)[i + 2];
InitScreenCoordinates(vtx2);
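
Note: the clip-plane intersection above computes factor = dp_prev / (dp_prev - dp) and feeds it to Vertex::Lerp, which weights v0 by factor and v1 by (1 - factor). A minimal sketch with plain floats; Vec2, LerpVertex and IntersectWithPlane are illustrative names, not Citra code:

#include <cassert>

struct Vec2 {
    float x, y;
};

static Vec2 LerpVertex(float factor, const Vec2& v0, const Vec2& v1) {
    // factor == 1 keeps v0, factor == 0 keeps v1 (same convention as Vertex::Lerp above)
    return {v0.x * factor + v1.x * (1.0f - factor),
            v0.y * factor + v1.y * (1.0f - factor)};
}

static Vec2 IntersectWithPlane(float dp, float dp_prev, const Vec2& v0, const Vec2& v1) {
    // dp / dp_prev are the signed plane distances of v0 / v1, as in GetIntersection()
    float factor = dp_prev / (dp_prev - dp);
    return LerpVertex(factor, v0, v1);
}

int main() {
    // v0 sits 1 unit inside the plane (dp = -1), v1 sits 1 unit outside (dp_prev = +1),
    // so the intersection lands exactly halfway between them.
    Vec2 p = IntersectWithPlane(-1.0f, 1.0f, Vec2{0.0f, 0.0f}, Vec2{2.0f, 2.0f});
    assert(p.x == 1.0f && p.y == 1.0f);
    return 0;
}
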
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index ea58e9f54..4955ff9f9 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -125,33 +125,37 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
// TODO: Verify that this actually modifies the register!
if (setup.index < 15) {
- g_state.vs_default_attributes[setup.index] = attribute;
+ g_state.input_default_attributes.attr[setup.index] = attribute;
setup.index++;
} else {
- // Put each attribute into an immediate input buffer.
- // When all specified immediate attributes are present, the Vertex Shader is invoked
- // and everything is
- // sent to the primitive assembler.
+ // Put each attribute into an immediate input buffer. When all specified immediate
+ // attributes are present, the Vertex Shader is invoked and everything is sent to
+ // the primitive assembler.
auto& immediate_input = g_state.immediate.input_vertex;
auto& immediate_attribute_id = g_state.immediate.current_attribute;
- immediate_input.attr[immediate_attribute_id++] = attribute;
+ immediate_input.attr[immediate_attribute_id] = attribute;
- if (immediate_attribute_id >= regs.vs.num_input_attributes + 1) {
+ if (immediate_attribute_id < regs.max_input_attrib_index) {
+ immediate_attribute_id += 1;
+ } else {
MICROPROFILE_SCOPE(GPU_Drawing);
immediate_attribute_id = 0;
- Shader::UnitState shader_unit;
- g_state.vs.Setup();
+ auto* shader_engine = Shader::GetEngine();
+ shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
// Send to vertex shader
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
static_cast<void*>(&immediate_input));
- g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1);
- Shader::OutputVertex output_vertex =
- shader_unit.output_registers.ToVertex(regs.vs);
+ Shader::UnitState shader_unit;
+ Shader::AttributeBuffer output{};
+
+ shader_unit.LoadInput(regs.vs, immediate_input);
+ shader_engine->Run(g_state.vs, shader_unit);
+ shader_unit.WriteOutput(regs.vs, output);
// Send to renderer
using Pica::Shader::OutputVertex;
@@ -160,7 +164,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
};
- g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
+ g_state.primitive_assembler.SubmitVertex(
+ Shader::OutputVertex::FromAttributeBuffer(regs, output), AddTriangle);
}
}
}
@@ -243,8 +248,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
unsigned int vertex_cache_pos = 0;
vertex_cache_ids.fill(-1);
+ auto* shader_engine = Shader::GetEngine();
Shader::UnitState shader_unit;
- g_state.vs.Setup();
+
+ shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
for (unsigned int index = 0; index < regs.num_vertices; ++index) {
// Indexed rendering doesn't use the start offset
@@ -276,17 +283,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
if (!vertex_cache_hit) {
// Initialize data for the current vertex
- Shader::InputVertex input;
+ Shader::AttributeBuffer input, output{};
loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
// Send to vertex shader
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
(void*)&input);
- g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes());
+ shader_unit.LoadInput(regs.vs, input);
+ shader_engine->Run(g_state.vs, shader_unit);
+ shader_unit.WriteOutput(regs.vs, output);
// Retrieve vertex from register data
- output_vertex = shader_unit.output_registers.ToVertex(regs.vs);
+ output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs, output);
if (is_indexed) {
vertex_cache[vertex_cache_pos] = output_vertex;
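
Note: the immediate-mode path now treats max_input_attrib_index as "attribute count minus one": attributes are buffered until the index reaches that value, at which point the vertex shader is invoked and the index resets. A small self-contained sketch of that convention (function and variable names are illustrative):

#include <cstdio>

// Returns true when the buffered vertex is complete and should be flushed to the shader.
static bool SubmitImmediateAttribute(unsigned& current_attribute, unsigned max_input_attrib_index) {
    if (current_attribute < max_input_attrib_index) {
        ++current_attribute; // keep buffering attributes for this vertex
        return false;
    }
    current_attribute = 0;   // vertex complete: reset the index and invoke the shader
    return true;
}

int main() {
    unsigned current = 0;
    const unsigned max_index = 2; // register holds "number of attributes minus one", so 3 per vertex
    for (int i = 0; i < 6; ++i) {
        bool flush = SubmitImmediateAttribute(current, max_index);
        std::printf("attribute %d -> %s\n", i, flush ? "flush vertex" : "buffer");
    }
    return 0;
}
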
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index c44b3d95a..2d40f7d4f 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -35,6 +35,7 @@
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/shader/shader.h"
+#include "video_core/texture/texture_decode.h"
#include "video_core/utils.h"
#include "video_core/video_core.h"
@@ -315,257 +316,6 @@ std::unique_ptr<PicaTrace> FinishPicaTracing() {
return ret;
}
-const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info,
- bool disable_alpha) {
- const unsigned int coarse_x = x & ~7;
- const unsigned int coarse_y = y & ~7;
-
- if (info.format != Regs::TextureFormat::ETC1 && info.format != Regs::TextureFormat::ETC1A4) {
- // TODO(neobrain): Fix code design to unify vertical block offsets!
- source += coarse_y * info.stride;
- }
-
- // TODO: Assert that width/height are multiples of block dimensions
-
- switch (info.format) {
- case Regs::TextureFormat::RGBA8: {
- auto res = Color::DecodeRGBA8(source + VideoCore::GetMortonOffset(x, y, 4));
- return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
- }
-
- case Regs::TextureFormat::RGB8: {
- auto res = Color::DecodeRGB8(source + VideoCore::GetMortonOffset(x, y, 3));
- return {res.r(), res.g(), res.b(), 255};
- }
-
- case Regs::TextureFormat::RGB5A1: {
- auto res = Color::DecodeRGB5A1(source + VideoCore::GetMortonOffset(x, y, 2));
- return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
- }
-
- case Regs::TextureFormat::RGB565: {
- auto res = Color::DecodeRGB565(source + VideoCore::GetMortonOffset(x, y, 2));
- return {res.r(), res.g(), res.b(), 255};
- }
-
- case Regs::TextureFormat::RGBA4: {
- auto res = Color::DecodeRGBA4(source + VideoCore::GetMortonOffset(x, y, 2));
- return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
- }
-
- case Regs::TextureFormat::IA8: {
- const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 2);
-
- if (disable_alpha) {
- // Show intensity as red, alpha as green
- return {source_ptr[1], source_ptr[0], 0, 255};
- } else {
- return {source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0]};
- }
- }
-
- case Regs::TextureFormat::RG8: {
- auto res = Color::DecodeRG8(source + VideoCore::GetMortonOffset(x, y, 2));
- return {res.r(), res.g(), 0, 255};
- }
-
- case Regs::TextureFormat::I8: {
- const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
- return {*source_ptr, *source_ptr, *source_ptr, 255};
- }
-
- case Regs::TextureFormat::A8: {
- const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
-
- if (disable_alpha) {
- return {*source_ptr, *source_ptr, *source_ptr, 255};
- } else {
- return {0, 0, 0, *source_ptr};
- }
- }
-
- case Regs::TextureFormat::IA4: {
- const u8* source_ptr = source + VideoCore::GetMortonOffset(x, y, 1);
-
- u8 i = Color::Convert4To8(((*source_ptr) & 0xF0) >> 4);
- u8 a = Color::Convert4To8((*source_ptr) & 0xF);
-
- if (disable_alpha) {
- // Show intensity as red, alpha as green
- return {i, a, 0, 255};
- } else {
- return {i, i, i, a};
- }
- }
-
- case Regs::TextureFormat::I4: {
- u32 morton_offset = VideoCore::GetMortonOffset(x, y, 1);
- const u8* source_ptr = source + morton_offset / 2;
-
- u8 i = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF);
- i = Color::Convert4To8(i);
-
- return {i, i, i, 255};
- }
-
- case Regs::TextureFormat::A4: {
- u32 morton_offset = VideoCore::GetMortonOffset(x, y, 1);
- const u8* source_ptr = source + morton_offset / 2;
-
- u8 a = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF);
- a = Color::Convert4To8(a);
-
- if (disable_alpha) {
- return {a, a, a, 255};
- } else {
- return {0, 0, 0, a};
- }
- }
-
- case Regs::TextureFormat::ETC1:
- case Regs::TextureFormat::ETC1A4: {
- bool has_alpha = (info.format == Regs::TextureFormat::ETC1A4);
-
- // ETC1 further subdivides each 8x8 tile into four 4x4 subtiles
- const int subtile_width = 4;
- const int subtile_height = 4;
-
- int subtile_index = ((x / subtile_width) & 1) + 2 * ((y / subtile_height) & 1);
- unsigned subtile_bytes = has_alpha ? 2 : 1; // TODO: Name...
-
- const u64* source_ptr = (const u64*)(source + coarse_x * subtile_bytes * 4 +
- coarse_y * subtile_bytes * 4 * (info.width / 8) +
- subtile_index * subtile_bytes * 8);
- u64 alpha = 0xFFFFFFFFFFFFFFFF;
- if (has_alpha) {
- alpha = *source_ptr;
- source_ptr++;
- }
-
- union ETC1Tile {
- // Each of these two is a collection of 16 bits (one per lookup value)
- BitField<0, 16, u64> table_subindexes;
- BitField<16, 16, u64> negation_flags;
-
- unsigned GetTableSubIndex(unsigned index) const {
- return (table_subindexes >> index) & 1;
- }
-
- bool GetNegationFlag(unsigned index) const {
- return ((negation_flags >> index) & 1) == 1;
- }
-
- BitField<32, 1, u64> flip;
- BitField<33, 1, u64> differential_mode;
-
- BitField<34, 3, u64> table_index_2;
- BitField<37, 3, u64> table_index_1;
-
- union {
- // delta value + base value
- BitField<40, 3, s64> db;
- BitField<43, 5, u64> b;
-
- BitField<48, 3, s64> dg;
- BitField<51, 5, u64> g;
-
- BitField<56, 3, s64> dr;
- BitField<59, 5, u64> r;
- } differential;
-
- union {
- BitField<40, 4, u64> b2;
- BitField<44, 4, u64> b1;
-
- BitField<48, 4, u64> g2;
- BitField<52, 4, u64> g1;
-
- BitField<56, 4, u64> r2;
- BitField<60, 4, u64> r1;
- } separate;
-
- const Math::Vec3<u8> GetRGB(int x, int y) const {
- int texel = 4 * x + y;
-
- if (flip)
- std::swap(x, y);
-
- // Lookup base value
- Math::Vec3<int> ret;
- if (differential_mode) {
- ret.r() = static_cast<int>(differential.r);
- ret.g() = static_cast<int>(differential.g);
- ret.b() = static_cast<int>(differential.b);
- if (x >= 2) {
- ret.r() += static_cast<int>(differential.dr);
- ret.g() += static_cast<int>(differential.dg);
- ret.b() += static_cast<int>(differential.db);
- }
- ret.r() = Color::Convert5To8(ret.r());
- ret.g() = Color::Convert5To8(ret.g());
- ret.b() = Color::Convert5To8(ret.b());
- } else {
- if (x < 2) {
- ret.r() = Color::Convert4To8(static_cast<u8>(separate.r1));
- ret.g() = Color::Convert4To8(static_cast<u8>(separate.g1));
- ret.b() = Color::Convert4To8(static_cast<u8>(separate.b1));
- } else {
- ret.r() = Color::Convert4To8(static_cast<u8>(separate.r2));
- ret.g() = Color::Convert4To8(static_cast<u8>(separate.g2));
- ret.b() = Color::Convert4To8(static_cast<u8>(separate.b2));
- }
- }
-
- // Add modifier
- unsigned table_index =
- static_cast<int>((x < 2) ? table_index_1.Value() : table_index_2.Value());
-
- static const std::array<std::array<u8, 2>, 8> etc1_modifier_table = {{
- {{2, 8}},
- {{5, 17}},
- {{9, 29}},
- {{13, 42}},
- {{18, 60}},
- {{24, 80}},
- {{33, 106}},
- {{47, 183}},
- }};
-
- int modifier = etc1_modifier_table.at(table_index).at(GetTableSubIndex(texel));
- if (GetNegationFlag(texel))
- modifier *= -1;
-
- ret.r() = MathUtil::Clamp(ret.r() + modifier, 0, 255);
- ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255);
- ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255);
-
- return ret.Cast<u8>();
- }
- } const* etc1_tile = reinterpret_cast<const ETC1Tile*>(source_ptr);
-
- alpha >>= 4 * ((x & 3) * 4 + (y & 3));
- return Math::MakeVec(etc1_tile->GetRGB(x & 3, y & 3),
- disable_alpha ? (u8)255 : Color::Convert4To8(alpha & 0xF));
- }
-
- default:
- LOG_ERROR(HW_GPU, "Unknown texture format: %x", (u32)info.format);
- DEBUG_ASSERT(false);
- return {};
- }
-}
-
-TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config,
- const Regs::TextureFormat& format) {
- TextureInfo info;
- info.physical_address = config.GetPhysicalAddress();
- info.width = config.width;
- info.height = config.height;
- info.format = format;
- info.stride = Pica::Regs::NibblesPerPixel(info.format) * info.width / 2;
- return info;
-}
-
#ifdef HAVE_PNG
// Adapter functions to libpng to write/flush to File::IOFile instances.
static void WriteIOFile(png_structp png_ptr, png_bytep data, png_size_t length) {
@@ -642,12 +392,12 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
buf = new u8[row_stride * texture_config.height];
for (unsigned y = 0; y < texture_config.height; ++y) {
for (unsigned x = 0; x < texture_config.width; ++x) {
- TextureInfo info;
+ Pica::Texture::TextureInfo info;
info.width = texture_config.width;
info.height = texture_config.height;
info.stride = row_stride;
info.format = g_state.regs.texture0_format;
- Math::Vec4<u8> texture_color = LookupTexture(data, x, y, info);
+ Math::Vec4<u8> texture_color = Pica::Texture::LookupTexture(data, x, y, info);
buf[3 * x + y * row_stride] = texture_color.r();
buf[3 * x + y * row_stride + 1] = texture_color.g();
buf[3 * x + y * row_stride + 2] = texture_color.b();
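
Note: the ETC1 decoder being relocated above adds a per-texel modifier to the base color: a table row is chosen by table_index, one of its two magnitudes by the texel's sub-index, the sign by the negation flag, and the result is clamped to [0, 255]. A self-contained sketch of just that step (ApplyEtc1Modifier is an illustrative name; the table values are the ones from the code above):

#include <algorithm>
#include <array>
#include <cassert>

static int ApplyEtc1Modifier(int base, unsigned table_index, unsigned subindex, bool negate) {
    static const std::array<std::array<int, 2>, 8> etc1_modifier_table = {{
        {{2, 8}}, {{5, 17}}, {{9, 29}}, {{13, 42}},
        {{18, 60}}, {{24, 80}}, {{33, 106}}, {{47, 183}},
    }};
    int modifier = etc1_modifier_table[table_index][subindex];
    if (negate)
        modifier = -modifier;
    return std::clamp(base + modifier, 0, 255);
}

int main() {
    assert(ApplyEtc1Modifier(100, 3, 1, false) == 142); // 100 + 42
    assert(ApplyEtc1Modifier(10, 7, 1, true) == 0);     // 10 - 183, clamped to 0
    return 0;
}
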
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 46ea8d9c7..938a2e1b5 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -205,31 +205,6 @@ inline bool IsPicaTracing() {
void OnPicaRegWrite(PicaTrace::Write write);
std::unique_ptr<PicaTrace> FinishPicaTracing();
-struct TextureInfo {
- PAddr physical_address;
- int width;
- int height;
- int stride;
- Pica::Regs::TextureFormat format;
-
- static TextureInfo FromPicaRegister(const Pica::Regs::TextureConfig& config,
- const Pica::Regs::TextureFormat& format);
-};
-
-/**
- * Lookup texel located at the given coordinates and return an RGBA vector of its color.
- * @param source Source pointer to read data from
- * @param s,t Texture coordinates to read from
- * @param info TextureInfo object describing the texture setup
- * @param disable_alpha This is used for debug widgets which use this method to display textures
- * without providing a good way to visualize alpha by themselves. If true, this will return 255 for
- * the alpha component, and either drop the information entirely or store it in an "unused" color
- * channel.
- * @todo Eventually we should get rid of the disable_alpha parameter.
- */
-const Math::Vec4<u8> LookupTexture(const u8* source, int s, int t, const TextureInfo& info,
- bool disable_alpha = false);
-
void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
std::string GetTevStageConfigColorCombinerString(const Pica::Regs::TevStageConfig& tev_stage);
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index ce2bd455e..b4a77c632 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -499,7 +499,7 @@ void Init() {
}
void Shutdown() {
- Shader::ClearCache();
+ Shader::Shutdown();
}
template <typename T>
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index b2db609ec..731540b99 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -99,7 +99,8 @@ struct Regs {
TEXCOORD1_U = 14,
TEXCOORD1_V = 15,
- // TODO: Not verified
+ TEXCOORD0_W = 16,
+
VIEW_X = 18,
VIEW_Y = 19,
VIEW_Z = 20,
@@ -275,8 +276,11 @@ struct Regs {
case TextureFormat::I8:
case TextureFormat::A8:
case TextureFormat::IA4:
- default: // placeholder for yet unknown formats
return 2;
+
+ default: // placeholder for yet unknown formats
+ UNIMPLEMENTED();
+ return 0;
}
}
@@ -868,7 +872,7 @@ struct Regs {
LightSrc light[8];
LightColor global_ambient; // Emission + (material.ambient * lighting.ambient)
INSERT_PADDING_WORDS(0x1);
- BitField<0, 3, u32> num_lights; // Number of enabled lights - 1
+ BitField<0, 3, u32> max_light_index; // Number of enabled lights - 1
union {
BitField<2, 2, LightingFresnelSelector> fresnel_selector;
@@ -1045,7 +1049,7 @@ struct Regs {
BitField<48, 12, u64> attribute_mask;
// number of total attributes minus 1
- BitField<60, 4, u64> num_extra_attributes;
+ BitField<60, 4, u64> max_attribute_index;
};
inline VertexAttributeFormat GetFormat(int n) const {
@@ -1076,7 +1080,7 @@ struct Regs {
}
inline int GetNumTotalAttributes() const {
- return (int)num_extra_attributes + 1;
+ return (int)max_attribute_index + 1;
}
// Attribute loaders map the source vertex data to input attributes
@@ -1176,7 +1180,12 @@ struct Regs {
}
} command_buffer;
- INSERT_PADDING_WORDS(0x07);
+ INSERT_PADDING_WORDS(4);
+
+ /// Number of input attributes to the vertex shader minus 1
+ BitField<0, 4, u32> max_input_attrib_index;
+
+ INSERT_PADDING_WORDS(2);
enum class GPUMode : u32 {
Drawing = 0,
@@ -1214,42 +1223,21 @@ struct Regs {
union {
// Number of input attributes to shader unit - 1
- BitField<0, 4, u32> num_input_attributes;
+ BitField<0, 4, u32> max_input_attribute_index;
};
// Offset to shader program entry point (in words)
BitField<0, 16, u32> main_offset;
- union {
- BitField<0, 4, u64> attribute0_register;
- BitField<4, 4, u64> attribute1_register;
- BitField<8, 4, u64> attribute2_register;
- BitField<12, 4, u64> attribute3_register;
- BitField<16, 4, u64> attribute4_register;
- BitField<20, 4, u64> attribute5_register;
- BitField<24, 4, u64> attribute6_register;
- BitField<28, 4, u64> attribute7_register;
- BitField<32, 4, u64> attribute8_register;
- BitField<36, 4, u64> attribute9_register;
- BitField<40, 4, u64> attribute10_register;
- BitField<44, 4, u64> attribute11_register;
- BitField<48, 4, u64> attribute12_register;
- BitField<52, 4, u64> attribute13_register;
- BitField<56, 4, u64> attribute14_register;
- BitField<60, 4, u64> attribute15_register;
-
- int GetRegisterForAttribute(int attribute_index) const {
- u64 fields[] = {
- attribute0_register, attribute1_register, attribute2_register,
- attribute3_register, attribute4_register, attribute5_register,
- attribute6_register, attribute7_register, attribute8_register,
- attribute9_register, attribute10_register, attribute11_register,
- attribute12_register, attribute13_register, attribute14_register,
- attribute15_register,
- };
- return (int)fields[attribute_index];
- }
- } input_register_map;
+ /// Maps input attributes to registers. 4-bits per attribute, specifying a register index
+ u32 input_attribute_to_register_map_low;
+ u32 input_attribute_to_register_map_high;
+
+ unsigned int GetRegisterForAttribute(unsigned int attribute_index) const {
+ u64 map = ((u64)input_attribute_to_register_map_high << 32) |
+ (u64)input_attribute_to_register_map_low;
+ return (map >> (attribute_index * 4)) & 0b1111;
+ }
BitField<0, 16, u32> output_mask;
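
Note: GetRegisterForAttribute now reads a 4-bit register index per attribute out of the two u32 halves of a 64-bit map, replacing the sixteen named BitFields. A standalone sketch of the same extraction (the example map values are made up):

#include <cassert>
#include <cstdint>

static unsigned GetRegisterForAttribute(uint32_t map_low, uint32_t map_high,
                                        unsigned attribute_index) {
    // Attributes 0..7 live in the low word, 8..15 in the high word, 4 bits each.
    uint64_t map = (static_cast<uint64_t>(map_high) << 32) | map_low;
    return static_cast<unsigned>((map >> (attribute_index * 4)) & 0xF);
}

int main() {
    uint32_t low = 0x00000003;  // attribute 0 -> register 3, attributes 1..7 -> register 0
    uint32_t high = 0x00000007; // attribute 8 -> register 7
    assert(GetRegisterForAttribute(low, high, 0) == 3);
    assert(GetRegisterForAttribute(low, high, 1) == 0);
    assert(GetRegisterForAttribute(low, high, 8) == 7);
    return 0;
}
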
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index e4f2e6d5d..785d05650 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -23,7 +23,7 @@ struct State {
Shader::ShaderSetup vs;
Shader::ShaderSetup gs;
- std::array<Math::Vec4<float24>, 16> vs_default_attributes;
+ Shader::AttributeBuffer input_default_attributes;
struct {
union LutEntry {
@@ -66,7 +66,7 @@ struct State {
/// Struct used to describe immediate mode rendering state
struct ImmediateModeState {
// Used to buffer partial vertices for immediate-mode rendering.
- Shader::InputVertex input_vertex;
+ Shader::AttributeBuffer input_vertex;
// Index of the next attribute to be loaded into `input_vertex`.
u32 current_attribute = 0;
} immediate;
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index be7377290..e71ff5719 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -14,7 +14,7 @@ PrimitiveAssembler<VertexType>::PrimitiveAssembler(Regs::TriangleTopology topolo
: topology(topology), buffer_index(0) {}
template <typename VertexType>
-void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx,
+void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
TriangleHandler triangle_handler) {
switch (topology) {
// TODO: Figure out what's different with TriangleTopology::Shader.
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index 0384d5984..24da47382 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -15,7 +15,8 @@ namespace Pica {
*/
template <typename VertexType>
struct PrimitiveAssembler {
- using TriangleHandler = std::function<void(VertexType& v0, VertexType& v1, VertexType& v2)>;
+ using TriangleHandler =
+ std::function<void(const VertexType& v0, const VertexType& v1, const VertexType& v2)>;
PrimitiveAssembler(Regs::TriangleTopology topology = Regs::TriangleTopology::List);
@@ -25,7 +26,7 @@ struct PrimitiveAssembler {
* NOTE: We could specify the triangle handler in the constructor, but this way we can
* keep event and handler code next to each other.
*/
- void SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler);
+ void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
/**
* Resets the internal state of the PrimitiveAssembler.
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index b9f5d4533..287d732b5 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -21,6 +21,7 @@
#include "video_core/pica_types.h"
#include "video_core/rasterizer.h"
#include "video_core/shader/shader.h"
+#include "video_core/texture/texture_decode.h"
#include "video_core/utils.h"
namespace Pica {
@@ -307,8 +308,8 @@ MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 24
* Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
* culling via recursion.
*/
-static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
- const Shader::OutputVertex& v2, bool reversed = false) {
+static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+ bool reversed = false) {
const auto& regs = g_state.regs;
MICROPROFILE_SCOPE(GPU_Rasterization);
@@ -579,10 +580,10 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader
u8* texture_data =
Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
auto info =
- DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
+ Texture::TextureInfo::FromPicaRegister(texture.config, texture.format);
// TODO: Apply the min and mag filters to the texture
- texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info);
+ texture_color[i] = Texture::LookupTexture(texture_data, s, t, info);
#if PICA_DUMP_TEXTURES
DebugUtils::DumpTexture(texture.config, texture_data);
#endif
@@ -1276,8 +1277,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader
}
}
-void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
- const Shader::OutputVertex& v2) {
+void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2) {
ProcessTriangleInternal(v0, v1, v2);
}
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
index 6cbda3067..3a72ac343 100644
--- a/src/video_core/rasterizer.h
+++ b/src/video_core/rasterizer.h
@@ -4,16 +4,44 @@
#pragma once
-namespace Pica {
+#include "video_core/shader/shader.h"
-namespace Shader {
-struct OutputVertex;
-}
+namespace Pica {
namespace Rasterizer {
-void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
- const Shader::OutputVertex& v2);
+struct Vertex : Shader::OutputVertex {
+ Vertex(const OutputVertex& v) : OutputVertex(v) {}
+
+ // Attributes used to store intermediate results
+ // position after perspective divide
+ Math::Vec3<float24> screenpos;
+
+ // Linear interpolation
+ // factor: 0=this, 1=vtx
+ void Lerp(float24 factor, const Vertex& vtx) {
+ pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
+
+ // TODO: Should perform perspective correct interpolation here...
+ tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+ tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
+ tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
+
+ screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
+
+ color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
+ }
+
+ // Linear interpolation
+ // factor: 0=v0, 1=v1
+ static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
+ Vertex ret = v0;
+ ret.Lerp(factor, v1);
+ return ret;
+ }
+};
+
+void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2);
} // namespace Rasterizer
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 5a306a5c8..071e4ace0 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -467,7 +467,7 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
// Fragment lighting switches
case PICA_REG_INDEX(lighting.disable):
- case PICA_REG_INDEX(lighting.num_lights):
+ case PICA_REG_INDEX(lighting.max_light_index):
case PICA_REG_INDEX(lighting.config0):
case PICA_REG_INDEX(lighting.config1):
case PICA_REG_INDEX(lighting.abs_lut_input):
@@ -716,8 +716,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) {
bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
- using PixelFormat = CachedSurface::PixelFormat;
- using SurfaceType = CachedSurface::SurfaceType;
CachedSurface src_params;
src_params.addr = config.GetPhysicalInputAddress();
@@ -748,7 +746,8 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
// Adjust the source rectangle to take into account parts of the input lines being cropped
if (config.input_width > config.output_width) {
- src_rect.right -= (config.input_width - config.output_width) * src_surface->res_scale_width;
+ src_rect.right -= static_cast<int>((config.input_width - config.output_width) *
+ src_surface->res_scale_width);
}
// Require destination surface to have same resolution scale as source to preserve scaling
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index e1a9cb361..a1aa07074 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -76,7 +76,7 @@ union PicaShaderConfig {
}
state.fog_mode = regs.fog_mode;
- state.fog_flip = regs.fog_flip;
+ state.fog_flip = regs.fog_flip != 0;
state.combiner_buffer_input = regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
@@ -84,7 +84,7 @@ union PicaShaderConfig {
// Fragment lighting
state.lighting.enable = !regs.lighting.disable;
- state.lighting.src_num = regs.lighting.num_lights + 1;
+ state.lighting.src_num = regs.lighting.max_light_index + 1;
for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) {
unsigned num = regs.lighting.light_enable.GetNum(light_index);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 85aa06cd5..60380257a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -17,10 +17,10 @@
#include "common/vector_math.h"
#include "core/frontend/emu_window.h"
#include "core/memory.h"
-#include "video_core/debug_utils/debug_utils.h"
#include "video_core/pica_state.h"
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/texture/texture_decode.h"
#include "video_core/utils.h"
#include "video_core/video_core.h"
@@ -172,7 +172,6 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface,
const MathUtil::Rectangle<int>& src_rect,
CachedSurface* dst_surface,
const MathUtil::Rectangle<int>& dst_rect) {
- using SurfaceType = CachedSurface::SurfaceType;
if (!CachedSurface::CheckFormatsBlittable(src_surface->pixel_format,
dst_surface->pixel_format)) {
@@ -340,17 +339,16 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
std::vector<Math::Vec4<u8>> tex_buffer(params.width * params.height);
- Pica::DebugUtils::TextureInfo tex_info;
+ Pica::Texture::TextureInfo tex_info;
tex_info.width = params.width;
tex_info.height = params.height;
- tex_info.stride =
- params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format;
+ tex_info.SetDefaultStride();
tex_info.physical_address = params.addr;
for (unsigned y = 0; y < params.height; ++y) {
for (unsigned x = 0; x < params.width; ++x) {
- tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture(
+ tex_buffer[x + params.width * y] = Pica::Texture::LookupTexture(
texture_src_data, x, params.height - 1 - y, tex_info);
}
}
@@ -513,8 +511,9 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params
CachedSurface* RasterizerCacheOpenGL::GetTextureSurface(
const Pica::Regs::FullTextureConfig& config) {
- Pica::DebugUtils::TextureInfo info =
- Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format);
+
+ Pica::Texture::TextureInfo info =
+ Pica::Texture::TextureInfo::FromPicaRegister(config.config, config.format);
CachedSurface params;
params.addr = info.physical_address;
@@ -556,14 +555,21 @@ RasterizerCacheOpenGL::GetFramebufferSurfaces(const Pica::Regs::FramebufferConfi
color_params.width = depth_params.width = config.GetWidth();
color_params.height = depth_params.height = config.GetHeight();
color_params.is_tiled = depth_params.is_tiled = true;
- if (VideoCore::g_scaled_resolution_enabled) {
- auto layout = VideoCore::g_emu_window->GetFramebufferLayout();
- // Assume same scaling factor for top and bottom screens
+ // Set the internal resolution, assume the same scaling factor for top and bottom screens
+ const Layout::FramebufferLayout& layout = VideoCore::g_emu_window->GetFramebufferLayout();
+ if (Settings::values.resolution_factor == 0.0f) {
+ // Auto - scale resolution to the window size
color_params.res_scale_width = depth_params.res_scale_width =
(float)layout.top_screen.GetWidth() / VideoCore::kScreenTopWidth;
color_params.res_scale_height = depth_params.res_scale_height =
(float)layout.top_screen.GetHeight() / VideoCore::kScreenTopHeight;
+ } else {
+ // Otherwise, scale the resolution by the specified factor
+ color_params.res_scale_width = Settings::values.resolution_factor;
+ depth_params.res_scale_width = Settings::values.resolution_factor;
+ color_params.res_scale_height = Settings::values.resolution_factor;
+ depth_params.res_scale_height = Settings::values.resolution_factor;
}
color_params.addr = config.GetColorBufferPhysicalAddress();
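
Note: the framebuffer surface code now picks its resolution scale from Settings::values.resolution_factor, where 0 means "auto" (scale to the window's framebuffer layout) and any other value is used directly for both dimensions. A tiny sketch of that selection (PickResScale and the numbers are illustrative, not emulator constants):

#include <cassert>

// resolution_factor == 0 means "auto": derive the scale from the window layout,
// otherwise use the configured factor as-is.
static float PickResScale(float resolution_factor, int layout_width, int native_width) {
    if (resolution_factor == 0.0f)
        return static_cast<float>(layout_width) / static_cast<float>(native_width);
    return resolution_factor;
}

int main() {
    assert(PickResScale(0.0f, 800, 400) == 2.0f); // auto: window is twice the native width
    assert(PickResScale(3.0f, 800, 400) == 3.0f); // explicit factor wins
    return 0;
}
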
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index b50e8292b..f57fdb3cc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -8,7 +8,14 @@
#include <memory>
#include <set>
#include <tuple>
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedef"
+#endif
#include <boost/icl/interval_map.hpp>
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
#include <glad/glad.h>
#include "common/assert.h"
#include "common/common_funcs.h"
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8f278722d..4c4f98ac9 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -293,7 +293,7 @@ static void AppendAlphaTestCondition(std::string& out, Regs::CompareFunc func) {
case CompareFunc::GreaterThanOrEqual: {
static const char* op[] = {"!=", "==", ">=", ">", "<=", "<"};
unsigned index = (unsigned)func - (unsigned)CompareFunc::Equal;
- out += "int(last_tex_env_out.a * 255.0f) " + std::string(op[index]) + " alphatest_ref";
+ out += "int(last_tex_env_out.a * 255.0) " + std::string(op[index]) + " alphatest_ref";
break;
}
@@ -422,16 +422,13 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
if (abs) {
// LUT index is in the range of (0.0, 1.0)
index = lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")"
- : "max(" + index + ", 0.f)";
- return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
+ : "max(" + index + ", 0.0)";
} else {
// LUT index is in the range of (-1.0, 1.0)
- index = "clamp(" + index + ", -1.0, 1.0)";
- return "(FLOAT_255 * ((" + index + " < 0) ? " + index + " + 2.0 : " + index +
- ") / 2.0)";
+ index = "((" + index + " < 0) ? " + index + " + 2.0 : " + index + ") / 2.0";
}
- return std::string();
+ return "(OFFSET_256 + SCALE_256 * clamp(" + index + ", 0.0, 1.0))";
};
// Gets the lighting lookup table value given the specified sampler and index
@@ -462,7 +459,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
if (light_config.dist_atten_enable) {
std::string index = "(" + light_src + ".dist_atten_scale * length(-view - " +
light_src + ".position) + " + light_src + ".dist_atten_bias)";
- index = "((clamp(" + index + ", 0.0, FLOAT_255)))";
+ index = "(OFFSET_256 + SCALE_256 * clamp(" + index + ", 0.0, 1.0))";
const unsigned lut_num =
((unsigned)Regs::LightingSampler::DistanceAttenuation + light_config.num);
dist_atten = GetLutValue((Regs::LightingSampler)lut_num, index);
@@ -580,8 +577,10 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
#version 330 core
#define NUM_TEV_STAGES 6
#define NUM_LIGHTS 8
-#define LIGHTING_LUT_SIZE 256
-#define FLOAT_255 (255.0 / 256.0)
+
+// Texture coordinate offsets and scales
+#define OFFSET_256 (0.5 / 256.0)
+#define SCALE_256 (255.0 / 256.0)
in vec4 primary_color;
in vec2 texcoord[3];
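
Note: the generated GLSL now addresses the 256-entry lighting LUTs through OFFSET_256 + SCALE_256 * clamp(x, 0.0, 1.0), i.e. 0.5/256 + (255/256) * x, mapping the clamped input onto texel centers rather than scaling by 255/256 alone. The same mapping in C++ for reference (a sketch, not emulator code):

#include <algorithm>
#include <cassert>
#include <cmath>

static float LutCoord(float x) {
    const float offset_256 = 0.5f / 256.0f;   // half a texel, i.e. OFFSET_256
    const float scale_256 = 255.0f / 256.0f;  // SCALE_256
    return offset_256 + scale_256 * std::clamp(x, 0.0f, 1.0f);
}

int main() {
    // 0.0 maps to the center of texel 0, 1.0 to the center of texel 255.
    assert(std::fabs(LutCoord(0.0f) - 0.5f / 256.0f) < 1e-6f);
    assert(std::fabs(LutCoord(1.0f) - 255.5f / 256.0f) < 1e-6f);
    return 0;
}
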
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index a4aa3c9e0..f5f7ea61d 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -2,14 +2,9 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <atomic>
#include <cmath>
#include <cstring>
-#include <unordered_map>
-#include <utility>
-#include <boost/range/algorithm/fill.hpp>
-#include "common/bit_field.h"
-#include "common/hash.h"
+#include "common/bit_set.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/pica.h"
@@ -25,37 +20,32 @@ namespace Pica {
namespace Shader {
-OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
+OutputVertex OutputVertex::FromAttributeBuffer(const Regs& regs, AttributeBuffer& input) {
// Setup output data
- OutputVertex ret;
- // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
- // figure out what those circumstances are and enable the remaining outputs then.
- unsigned index = 0;
- for (unsigned i = 0; i < 7; ++i) {
+ union {
+ OutputVertex ret{};
+ std::array<float24, 24> vertex_slots;
+ };
+ static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
- if (index >= g_state.regs.vs_output_total)
- break;
+ unsigned int num_attributes = regs.vs_output_total;
+ ASSERT(num_attributes <= 7);
+ for (unsigned int i = 0; i < num_attributes; ++i) {
+ const auto& output_register_map = regs.vs_output_attributes[i];
- if ((config.output_mask & (1 << i)) == 0)
- continue;
-
- const auto& output_register_map = g_state.regs.vs_output_attributes[index];
-
- u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y,
- output_register_map.map_z, output_register_map.map_w};
+ Regs::VSOutputAttributes::Semantic semantics[4] = {
+ output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
+ output_register_map.map_w};
for (unsigned comp = 0; comp < 4; ++comp) {
- float24* out = ((float24*)&ret) + semantics[comp];
- if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
- *out = value[i][comp];
- } else {
- // Zero output so that attributes which aren't output won't have denormals in them,
- // which would slow us down later.
- memset(out, 0, sizeof(*out));
+ Regs::VSOutputAttributes::Semantic semantic = semantics[comp];
+ float24* out = &vertex_slots[semantic];
+ if (semantic < vertex_slots.size()) {
+ *out = input.attr[i][comp];
+ } else if (semantic != Regs::VSOutputAttributes::INVALID) {
+ LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
}
}
-
- index++;
}
// The hardware takes the absolute and saturates vertex colors like this, *before* doing
@@ -76,84 +66,47 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
return ret;
}
-#ifdef ARCHITECTURE_x86_64
-static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
-static const JitShader* jit_shader;
-#endif // ARCHITECTURE_x86_64
+void UnitState::LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input) {
+ const unsigned max_attribute = config.max_input_attribute_index;
-void ClearCache() {
-#ifdef ARCHITECTURE_x86_64
- shader_map.clear();
-#endif // ARCHITECTURE_x86_64
+ for (unsigned attr = 0; attr <= max_attribute; ++attr) {
+ unsigned reg = config.GetRegisterForAttribute(attr);
+ registers.input[reg] = input.attr[attr];
+ }
}
-void ShaderSetup::Setup() {
-#ifdef ARCHITECTURE_x86_64
- if (VideoCore::g_shader_jit_enabled) {
- u64 cache_key =
- Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
- Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data));
-
- auto iter = shader_map.find(cache_key);
- if (iter != shader_map.end()) {
- jit_shader = iter->second.get();
- } else {
- auto shader = std::make_unique<JitShader>();
- shader->Compile();
- jit_shader = shader.get();
- shader_map[cache_key] = std::move(shader);
- }
+void UnitState::WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output) {
+ unsigned int output_i = 0;
+ for (unsigned int reg : Common::BitSet<u32>(config.output_mask)) {
+ output.attr[output_i++] = registers.output[reg];
}
-#endif // ARCHITECTURE_x86_64
}
MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
-void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) {
- auto& config = g_state.regs.vs;
- auto& setup = g_state.vs;
-
- MICROPROFILE_SCOPE(GPU_Shader);
-
- // Setup input register table
- const auto& attribute_register_map = config.input_register_map;
-
- for (unsigned i = 0; i < num_attributes; i++)
- state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
-
- state.conditional_code[0] = false;
- state.conditional_code[1] = false;
+#ifdef ARCHITECTURE_x86_64
+static std::unique_ptr<JitX64Engine> jit_engine;
+#endif // ARCHITECTURE_x86_64
+static InterpreterEngine interpreter_engine;
+ShaderEngine* GetEngine() {
#ifdef ARCHITECTURE_x86_64
+ // TODO(yuriks): Re-initialize on each change rather than being persistent
if (VideoCore::g_shader_jit_enabled) {
- jit_shader->Run(setup, state, config.main_offset);
- } else {
- DebugData<false> dummy_debug_data;
- RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
+ if (jit_engine == nullptr) {
+ jit_engine = std::make_unique<JitX64Engine>();
+ }
+ return jit_engine.get();
}
-#else
- DebugData<false> dummy_debug_data;
- RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
#endif // ARCHITECTURE_x86_64
-}
-
-DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes,
- const Regs::ShaderConfig& config,
- const ShaderSetup& setup) {
- UnitState state;
- DebugData<true> debug_data;
-
- // Setup input register table
- boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
- const auto& attribute_register_map = config.input_register_map;
- for (unsigned i = 0; i < num_attributes; i++)
- state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
- state.conditional_code[0] = false;
- state.conditional_code[1] = false;
+ return &interpreter_engine;
+}
- RunInterpreter(setup, state, debug_data, config.main_offset);
- return debug_data;
+void Shutdown() {
+#ifdef ARCHITECTURE_x86_64
+ jit_engine = nullptr;
+#endif // ARCHITECTURE_x86_64
}
} // namespace Shader
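
Note: UnitState::WriteOutput now walks the set bits of config.output_mask (via Common::BitSet) and packs the enabled output registers contiguously into the AttributeBuffer, in ascending register order. An equivalent loop in plain standard C++ (stand-in values, no Citra types):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
    std::array<int, 16> output_regs{};
    for (int i = 0; i < 16; ++i)
        output_regs[i] = 100 + i;                        // stand-in register contents

    const uint32_t output_mask = 0b0000'0000'0001'0101;  // registers 0, 2 and 4 enabled
    std::array<int, 16> attr_buffer{};
    unsigned out = 0;
    for (unsigned reg = 0; reg < 16; ++reg)
        if (output_mask & (1u << reg))
            attr_buffer[out++] = output_regs[reg];       // compact enabled registers

    assert(out == 3);
    assert(attr_buffer[0] == 100 && attr_buffer[1] == 102 && attr_buffer[2] == 104);
    return 0;
}
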
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2b07759b9..b188d3edf 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,7 +6,6 @@
#include <array>
#include <cstddef>
-#include <memory>
#include <type_traits>
#include <nihstro/shader_bytecode.h>
#include "common/assert.h"
@@ -15,7 +14,6 @@
#include "common/vector_math.h"
#include "video_core/pica.h"
#include "video_core/pica_types.h"
-#include "video_core/shader/debug_data.h"
using nihstro::RegisterType;
using nihstro::SourceRegister;
@@ -25,14 +23,11 @@ namespace Pica {
namespace Shader {
-struct InputVertex {
+struct AttributeBuffer {
alignas(16) Math::Vec4<float24> attr[16];
};
struct OutputVertex {
- OutputVertex() = default;
-
- // VS output attributes
Math::Vec4<float24> pos;
Math::Vec4<float24> quat;
Math::Vec4<float24> color;
@@ -44,49 +39,22 @@ struct OutputVertex {
INSERT_PADDING_WORDS(1);
Math::Vec2<float24> tc2;
- // Padding for optimal alignment
- INSERT_PADDING_WORDS(4);
-
- // Attributes used to store intermediate results
-
- // position after perspective divide
- Math::Vec3<float24> screenpos;
- INSERT_PADDING_WORDS(1);
-
- // Linear interpolation
- // factor: 0=this, 1=vtx
- void Lerp(float24 factor, const OutputVertex& vtx) {
- pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
-
- // TODO: Should perform perspective correct interpolation here...
- tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
- tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
- tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
-
- screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
-
- color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
- }
-
- // Linear interpolation
- // factor: 0=v0, 1=v1
- static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
- OutputVertex ret = v0;
- ret.Lerp(factor, v1);
- return ret;
- }
+ static OutputVertex FromAttributeBuffer(const Regs& regs, AttributeBuffer& output);
};
+#define ASSERT_POS(var, pos) \
+ static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \
+ "offset.")
+ASSERT_POS(pos, Regs::VSOutputAttributes::POSITION_X);
+ASSERT_POS(quat, Regs::VSOutputAttributes::QUATERNION_X);
+ASSERT_POS(color, Regs::VSOutputAttributes::COLOR_R);
+ASSERT_POS(tc0, Regs::VSOutputAttributes::TEXCOORD0_U);
+ASSERT_POS(tc1, Regs::VSOutputAttributes::TEXCOORD1_U);
+ASSERT_POS(tc0_w, Regs::VSOutputAttributes::TEXCOORD0_W);
+ASSERT_POS(view, Regs::VSOutputAttributes::VIEW_X);
+ASSERT_POS(tc2, Regs::VSOutputAttributes::TEXCOORD2_U);
+#undef ASSERT_POS
static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
-static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
-
-struct OutputRegisters {
- OutputRegisters() = default;
-
- alignas(16) Math::Vec4<float24> value[16];
-
- OutputVertex ToVertex(const Regs::ShaderConfig& config) const;
-};
-static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD");
+static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
/**
* This structure contains the state information that needs to be unique for a shader unit. The 3DS
@@ -100,11 +68,10 @@ struct UnitState {
// required to be 16-byte aligned.
alignas(16) Math::Vec4<float24> input[16];
alignas(16) Math::Vec4<float24> temporary[16];
+ alignas(16) Math::Vec4<float24> output[16];
} registers;
static_assert(std::is_pod<Registers>::value, "Structure is not POD");
- OutputRegisters output_registers;
-
bool conditional_code[2];
// Two Address registers and one loop counter
@@ -130,7 +97,7 @@ struct UnitState {
static size_t OutputOffset(const DestRegister& reg) {
switch (reg.GetRegisterType()) {
case RegisterType::Output:
- return offsetof(UnitState, output_registers.value) +
+ return offsetof(UnitState, registers.output) +
reg.GetIndex() * sizeof(Math::Vec4<float24>);
case RegisterType::Temporary:
@@ -142,13 +109,19 @@ struct UnitState {
return 0;
}
}
-};
-/// Clears the shader cache
-void ClearCache();
+ /**
+ * Loads the unit state with an input vertex.
+ *
+ * @param config Shader configuration registers corresponding to the unit.
+ * @param input Attribute buffer to load into the input registers.
+ */
+ void LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input);
-struct ShaderSetup {
+ void WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output);
+};
+struct ShaderSetup {
struct {
// The float uniforms are accessed by the shader JIT using SSE instructions, and are
// therefore required to be 16-byte aligned.
@@ -173,32 +146,37 @@ struct ShaderSetup {
std::array<u32, 1024> program_code;
std::array<u32, 1024> swizzle_data;
+ /// Data private to ShaderEngines
+ struct EngineData {
+ unsigned int entry_point;
+ /// Used by the JIT, points to a compiled shader object.
+ const void* cached_shader = nullptr;
+ } engine_data;
+};
+
+class ShaderEngine {
+public:
+ virtual ~ShaderEngine() = default;
+
/**
* Performs any shader unit setup that only needs to happen once per shader (as opposed to once
* per vertex, which would happen within the `Run` function).
*/
- void Setup();
+ virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0;
/**
- * Runs the currently setup shader
- * @param state Shader unit state, must be setup per shader and per shader unit
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
+ * Runs the currently setup shader.
+ *
+ * @param setup Shader engine state, must be setup with SetupBatch on each shader change.
+ * @param state Shader unit state, must be setup with input data before each shader invocation.
*/
- void Run(UnitState& state, const InputVertex& input, int num_attributes);
-
- /**
- * Produce debug information based on the given shader and input vertex
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @param config Configuration object for the shader pipeline
- * @param setup Setup object for the shader pipeline
- * @return Debug information for this shader with regards to the given vertex
- */
- DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes,
- const Regs::ShaderConfig& config, const ShaderSetup& setup);
+ virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0;
};
+// TODO(yuriks): Remove and make it non-global state somewhere
+ShaderEngine* GetEngine();
+void Shutdown();
+
} // namespace Shader
} // namespace Pica
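
Note: the new ShaderEngine interface splits per-batch setup (entry point, JIT shader selection) from the per-vertex Run call. A stripped-down analogue of that split using stand-in types rather than Citra's (Setup, Unit, Engine and TrivialEngine are illustrative):

#include <cassert>

struct Setup {                 // analogous to ShaderSetup::engine_data
    unsigned entry_point = 0;
};

struct Unit {                  // analogous to UnitState
    int input;
    int output;
};

class Engine {                 // analogous to Shader::ShaderEngine
public:
    virtual ~Engine() = default;
    virtual void SetupBatch(Setup& setup, unsigned entry_point) = 0;
    virtual void Run(const Setup& setup, Unit& unit) const = 0;
};

class TrivialEngine final : public Engine {
public:
    void SetupBatch(Setup& setup, unsigned entry_point) override {
        setup.entry_point = entry_point;   // per-batch work happens once here
    }
    void Run(const Setup& setup, Unit& unit) const override {
        unit.output = unit.input + static_cast<int>(setup.entry_point);
    }
};

int main() {
    Setup setup;
    TrivialEngine engine;
    engine.SetupBatch(setup, 5);           // once per shader change
    for (int v = 0; v < 3; ++v) {          // once per vertex
        Unit unit{v, 0};
        engine.Run(setup, unit);
        assert(unit.output == v + 5);
    }
    return 0;
}
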
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 70db4167e..81522b8f5 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -7,10 +7,12 @@
#include <cmath>
#include <numeric>
#include <boost/container/static_vector.hpp>
+#include <boost/range/algorithm/fill.hpp>
#include <nihstro/shader_bytecode.h>
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
+#include "common/microprofile.h"
#include "common/vector_math.h"
#include "video_core/pica_state.h"
#include "video_core/pica_types.h"
@@ -27,8 +29,6 @@ namespace Pica {
namespace Shader {
-constexpr u32 INVALID_ADDRESS = 0xFFFFFFFF;
-
struct CallStackElement {
u32 final_address; // Address upon which we jump to return_address
u32 return_address; // Where to jump when leaving scope
@@ -39,12 +39,15 @@ struct CallStackElement {
};
template <bool Debug>
-void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
- unsigned offset) {
+static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
+ unsigned offset) {
// TODO: Is there a maximal size for this?
boost::container::static_vector<CallStackElement, 16> call_stack;
u32 program_counter = offset;
+ state.conditional_code[0] = false;
+ state.conditional_code[1] = false;
+
auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset,
u8 repeat_count, u8 loop_increment) {
// -1 to make sure when incrementing the PC we end up at the correct offset
@@ -75,9 +78,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
}
};
- const auto& uniforms = g_state.vs.uniforms;
- const auto& swizzle_data = g_state.vs.swizzle_data;
- const auto& program_code = g_state.vs.program_code;
+ const auto& uniforms = setup.uniforms;
+ const auto& swizzle_data = setup.swizzle_data;
+ const auto& program_code = setup.program_code;
// Placeholder for invalid inputs
static float24 dummy_vec4_float24[4];
@@ -172,7 +175,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
float24* dest =
(instr.common.dest.Value() < 0x10)
- ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0]
+ ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
: (instr.common.dest.Value() < 0x20)
? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
@@ -515,7 +518,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
float24* dest =
(instr.mad.dest.Value() < 0x10)
- ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0]
+ ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
: (instr.mad.dest.Value() < 0x20)
? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
@@ -649,9 +652,33 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
}
}
-// Explicit instantiation
-template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<false>&, unsigned offset);
-template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<true>&, unsigned offset);
+void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
+ ASSERT(entry_point < 1024);
+ setup.engine_data.entry_point = entry_point;
+}
+
+MICROPROFILE_DECLARE(GPU_Shader);
+
+void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
+
+ MICROPROFILE_SCOPE(GPU_Shader);
+
+ DebugData<false> dummy_debug_data;
+ RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point);
+}
+
+DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
+ const AttributeBuffer& input,
+ const Regs::ShaderConfig& config) const {
+ UnitState state;
+ DebugData<true> debug_data;
+
+ // Setup input register table
+ boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
+ state.LoadInput(config, input);
+ RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
+ return debug_data;
+}
} // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index d31dcd7a6..d7a61e122 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -4,18 +4,27 @@
#pragma once
+#include "video_core/shader/debug_data.h"
+#include "video_core/shader/shader.h"
+
namespace Pica {
namespace Shader {
-struct UnitState;
-
-template <bool Debug>
-struct DebugData;
-
-template <bool Debug>
-void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
- unsigned offset);
+class InterpreterEngine final : public ShaderEngine {
+public:
+ void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
+ void Run(const ShaderSetup& setup, UnitState& state) const override;
+
+ /**
+ * Produce debug information based on the given shader and input vertex
+ * @param input Input vertex into the shader
+ * @param config Configuration object for the shader pipeline
+ * @return Debug information for this shader with regards to the given vertex
+ */
+ DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input,
+ const Regs::ShaderConfig& config) const;
+};
} // namespace
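ProduceDebugInfo is the entry point a shader debugger would use for single-vertex tracing. A short usage sketch under the same assumptions as before (`setup`, `input` and `config` filled in by existing debugger code; not part of the patch):

    // Sketch: trace one vertex through the interpreter with per-instruction records.
    Pica::Shader::InterpreterEngine dbg_engine;
    dbg_engine.SetupBatch(setup, entry_point);   // entry_point must be < 1024
    Pica::Shader::DebugData<true> trace =
        dbg_engine.ProduceDebugInfo(setup, input, config);
    // `trace` now holds the recorded state for each executed instruction.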
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index c588b778b..0ee0dd9ef 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -1,888 +1,48 @@
-// Copyright 2015 Citra Emulator Project
+// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <nihstro/shader_bytecode.h>
-#include <smmintrin.h>
-#include <xmmintrin.h>
-#include "common/assert.h"
-#include "common/logging/log.h"
-#include "common/vector_math.h"
-#include "common/x64/cpu_detect.h"
-#include "common/x64/xbyak_abi.h"
-#include "common/x64/xbyak_util.h"
-#include "video_core/pica_state.h"
-#include "video_core/pica_types.h"
+#include "common/hash.h"
+#include "common/microprofile.h"
#include "video_core/shader/shader.h"
#include "video_core/shader/shader_jit_x64.h"
-
-using namespace Common::X64;
-using namespace Xbyak::util;
-using Xbyak::Label;
-using Xbyak::Reg32;
-using Xbyak::Reg64;
-using Xbyak::Xmm;
+#include "video_core/shader/shader_jit_x64_compiler.h"
namespace Pica {
-
namespace Shader {
-typedef void (JitShader::*JitFunction)(Instruction instr);
-
-const JitFunction instr_table[64] = {
- &JitShader::Compile_ADD, // add
- &JitShader::Compile_DP3, // dp3
- &JitShader::Compile_DP4, // dp4
- &JitShader::Compile_DPH, // dph
- nullptr, // unknown
- &JitShader::Compile_EX2, // ex2
- &JitShader::Compile_LG2, // lg2
- nullptr, // unknown
- &JitShader::Compile_MUL, // mul
- &JitShader::Compile_SGE, // sge
- &JitShader::Compile_SLT, // slt
- &JitShader::Compile_FLR, // flr
- &JitShader::Compile_MAX, // max
- &JitShader::Compile_MIN, // min
- &JitShader::Compile_RCP, // rcp
- &JitShader::Compile_RSQ, // rsq
- nullptr, // unknown
- nullptr, // unknown
- &JitShader::Compile_MOVA, // mova
- &JitShader::Compile_MOV, // mov
- nullptr, // unknown
- nullptr, // unknown
- nullptr, // unknown
- nullptr, // unknown
- &JitShader::Compile_DPH, // dphi
- nullptr, // unknown
- &JitShader::Compile_SGE, // sgei
- &JitShader::Compile_SLT, // slti
- nullptr, // unknown
- nullptr, // unknown
- nullptr, // unknown
- nullptr, // unknown
- nullptr, // unknown
- &JitShader::Compile_NOP, // nop
- &JitShader::Compile_END, // end
- nullptr, // break
- &JitShader::Compile_CALL, // call
- &JitShader::Compile_CALLC, // callc
- &JitShader::Compile_CALLU, // callu
- &JitShader::Compile_IF, // ifu
- &JitShader::Compile_IF, // ifc
- &JitShader::Compile_LOOP, // loop
- nullptr, // emit
- nullptr, // sete
- &JitShader::Compile_JMP, // jmpc
- &JitShader::Compile_JMP, // jmpu
- &JitShader::Compile_CMP, // cmp
- &JitShader::Compile_CMP, // cmp
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // madi
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
- &JitShader::Compile_MAD, // mad
-};
-
-// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
-// be used as scratch registers within a compiler function. The other registers have designated
-// purposes, as documented below:
+JitX64Engine::JitX64Engine() = default;
+JitX64Engine::~JitX64Engine() = default;
-/// Pointer to the uniform memory
-static const Reg64 SETUP = r9;
-/// The two 32-bit VS address offset registers set by the MOVA instruction
-static const Reg64 ADDROFFS_REG_0 = r10;
-static const Reg64 ADDROFFS_REG_1 = r11;
-/// VS loop count register (Multiplied by 16)
-static const Reg32 LOOPCOUNT_REG = r12d;
-/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker)
-static const Reg32 LOOPCOUNT = esi;
-/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
-static const Reg32 LOOPINC = edi;
-/// Result of the previous CMP instruction for the X-component comparison
-static const Reg64 COND0 = r13;
-/// Result of the previous CMP instruction for the Y-component comparison
-static const Reg64 COND1 = r14;
-/// Pointer to the UnitState instance for the current VS unit
-static const Reg64 STATE = r15;
-/// SIMD scratch register
-static const Xmm SCRATCH = xmm0;
-/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
-static const Xmm SRC1 = xmm1;
-/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
-static const Xmm SRC2 = xmm2;
-/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
-static const Xmm SRC3 = xmm3;
-/// Additional scratch register
-static const Xmm SCRATCH2 = xmm4;
-/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
-static const Xmm ONE = xmm14;
-/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
-static const Xmm NEGBIT = xmm15;
+void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
+ ASSERT(entry_point < 1024);
+ setup.engine_data.entry_point = entry_point;
-// State registers that must not be modified by external functions calls
-// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
-static const BitSet32 persistent_regs = BuildRegSet({
- // Pointers to register blocks
- SETUP, STATE,
- // Cached registers
- ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
- // Constants
- ONE, NEGBIT,
-});
+ u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code));
+ u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data));
-/// Raw constant for the source register selector that indicates no swizzling is performed
-static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
-/// Raw constant for the destination register enable mask that indicates all components are enabled
-static const u8 NO_DEST_REG_MASK = 0xf;
-
-/**
- * Get the vertex shader instruction for a given offset in the current shader program
- * @param offset Offset in the current shader program of the instruction
- * @return Instruction at the specified offset
- */
-static Instruction GetVertexShaderInstruction(size_t offset) {
- return {g_state.vs.program_code[offset]};
-}
-
-static void LogCritical(const char* msg) {
- LOG_CRITICAL(HW_GPU, "%s", msg);
-}
-
-void JitShader::Compile_Assert(bool condition, const char* msg) {
- if (!condition) {
- mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
- CallFarFunction(*this, LogCritical);
- }
-}
-
-/**
- * Loads and swizzles a source register into the specified XMM register.
- * @param instr VS instruction, used for determining how to load the source register
- * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
- * @param src_reg SourceRegister object corresponding to the source register to load
- * @param dest Destination XMM register to store the loaded, swizzled source register
- */
-void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
- Xmm dest) {
- Reg64 src_ptr;
- size_t src_offset;
-
- if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
- src_ptr = SETUP;
- src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
+ u64 cache_key = code_hash ^ swizzle_hash;
+ auto iter = cache.find(cache_key);
+ if (iter != cache.end()) {
+ setup.engine_data.cached_shader = iter->second.get();
} else {
- src_ptr = STATE;
- src_offset = UnitState::InputOffset(src_reg);
- }
-
- int src_offset_disp = (int)src_offset;
- ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
-
- unsigned operand_desc_id;
-
- const bool is_inverted =
- (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
-
- unsigned address_register_index;
- unsigned offset_src;
-
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
- instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
- operand_desc_id = instr.mad.operand_desc_id;
- offset_src = is_inverted ? 3 : 2;
- address_register_index = instr.mad.address_register_index;
- } else {
- operand_desc_id = instr.common.operand_desc_id;
- offset_src = is_inverted ? 2 : 1;
- address_register_index = instr.common.address_register_index;
- }
-
- if (src_num == offset_src && address_register_index != 0) {
- switch (address_register_index) {
- case 1: // address offset 1
- movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
- break;
- case 2: // address offset 2
- movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
- break;
- case 3: // address offset 3
- movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
- break;
- default:
- UNREACHABLE();
- break;
- }
- } else {
- // Load the source
- movaps(dest, xword[src_ptr + src_offset_disp]);
- }
-
- SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
-
- // Generate instructions for source register swizzling as needed
- u8 sel = swiz.GetRawSelector(src_num);
- if (sel != NO_SRC_REG_SWIZZLE) {
- // Selector component order needs to be reversed for the SHUFPS instruction
- sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
-
- // Shuffle inputs for swizzle
- shufps(dest, dest, sel);
- }
-
- // If the source register should be negated, flip the negative bit using XOR
- const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
- if (negate[src_num - 1]) {
- xorps(dest, NEGBIT);
+ auto shader = std::make_unique<JitShader>();
+ shader->Compile(&setup.program_code, &setup.swizzle_data);
+ setup.engine_data.cached_shader = shader.get();
+ cache.emplace_hint(iter, cache_key, std::move(shader));
}
}
-void JitShader::Compile_DestEnable(Instruction instr, Xmm src) {
- DestRegister dest;
- unsigned operand_desc_id;
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
- instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
- operand_desc_id = instr.mad.operand_desc_id;
- dest = instr.mad.dest.Value();
- } else {
- operand_desc_id = instr.common.operand_desc_id;
- dest = instr.common.dest.Value();
- }
-
- SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
-
- size_t dest_offset_disp = UnitState::OutputOffset(dest);
-
- // If all components are enabled, write the result to the destination register
- if (swiz.dest_mask == NO_DEST_REG_MASK) {
- // Store dest back to memory
- movaps(xword[STATE + dest_offset_disp], src);
-
- } else {
- // Not all components are enabled, so mask the result when storing to the destination
- // register...
- movaps(SCRATCH, xword[STATE + dest_offset_disp]);
-
- if (Common::GetCPUCaps().sse4_1) {
- u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
- ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
- blendps(SCRATCH, src, mask);
- } else {
- movaps(SCRATCH2, src);
- unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
- unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
-
- // Compute selector to selectively copy source components to destination for SHUFPS
- // instruction
- u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
- ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
- ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
- ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
- shufps(SCRATCH, SCRATCH2, sel);
- }
-
- // Store dest back to memory
- movaps(xword[STATE + dest_offset_disp], SCRATCH);
- }
-}
-
-void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
- movaps(scratch, src1);
- cmpordps(scratch, src2);
-
- mulps(src1, src2);
+MICROPROFILE_DECLARE(GPU_Shader);
- movaps(src2, src1);
- cmpunordps(src2, src2);
+void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const {
+ ASSERT(setup.engine_data.cached_shader != nullptr);
- xorps(scratch, src2);
- andps(src1, scratch);
-}
-
-void JitShader::Compile_EvaluateCondition(Instruction instr) {
- // Note: NXOR is used below to check for equality
- switch (instr.flow_control.op) {
- case Instruction::FlowControlType::Or:
- mov(eax, COND0);
- mov(ebx, COND1);
- xor(eax, (instr.flow_control.refx.Value() ^ 1));
- xor(ebx, (instr.flow_control.refy.Value() ^ 1));
- or (eax, ebx);
- break;
-
- case Instruction::FlowControlType::And:
- mov(eax, COND0);
- mov(ebx, COND1);
- xor(eax, (instr.flow_control.refx.Value() ^ 1));
- xor(ebx, (instr.flow_control.refy.Value() ^ 1));
- and(eax, ebx);
- break;
-
- case Instruction::FlowControlType::JustX:
- mov(eax, COND0);
- xor(eax, (instr.flow_control.refx.Value() ^ 1));
- break;
-
- case Instruction::FlowControlType::JustY:
- mov(eax, COND1);
- xor(eax, (instr.flow_control.refy.Value() ^ 1));
- break;
- }
-}
+ MICROPROFILE_SCOPE(GPU_Shader);
-void JitShader::Compile_UniformCondition(Instruction instr) {
- size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
- cmp(byte[SETUP + offset], 0);
+ const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader);
+ shader->Run(setup, state, setup.engine_data.entry_point);
}
-BitSet32 JitShader::PersistentCallerSavedRegs() {
- return persistent_regs & ABI_ALL_CALLER_SAVED;
-}
-
-void JitShader::Compile_ADD(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- addps(SRC1, SRC2);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_DP3(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-
- Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
-
- movaps(SRC2, SRC1);
- shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
-
- movaps(SRC3, SRC1);
- shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
-
- shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
- addps(SRC1, SRC2);
- addps(SRC1, SRC3);
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_DP4(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-
- Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
-
- movaps(SRC2, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
- addps(SRC1, SRC2);
-
- movaps(SRC2, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
- addps(SRC1, SRC2);
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_DPH(Instruction instr) {
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
- } else {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- }
-
- if (Common::GetCPUCaps().sse4_1) {
- // Set 4th component to 1.0
- blendps(SRC1, ONE, 0b1000);
- } else {
- // Set 4th component to 1.0
- movaps(SCRATCH, SRC1);
- unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
- unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
- }
-
- Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
-
- movaps(SRC2, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
- addps(SRC1, SRC2);
-
- movaps(SRC2, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
- addps(SRC1, SRC2);
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_EX2(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- movss(xmm0, SRC1); // ABI_PARAM1
-
- ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
- CallFarFunction(*this, exp2f);
- ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-
- shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
- movaps(SRC1, xmm0);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_LG2(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- movss(xmm0, SRC1); // ABI_PARAM1
-
- ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
- CallFarFunction(*this, log2f);
- ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-
- shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
- movaps(SRC1, xmm0);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_MUL(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_SGE(Instruction instr) {
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
- } else {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- }
-
- cmpleps(SRC2, SRC1);
- andps(SRC2, ONE);
-
- Compile_DestEnable(instr, SRC2);
-}
-
-void JitShader::Compile_SLT(Instruction instr) {
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
- } else {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- }
-
- cmpltps(SRC1, SRC2);
- andps(SRC1, ONE);
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_FLR(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
-
- if (Common::GetCPUCaps().sse4_1) {
- roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
- } else {
- cvttps2dq(SRC1, SRC1);
- cvtdq2ps(SRC1, SRC1);
- }
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_MAX(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
- maxps(SRC1, SRC2);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_MIN(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
- // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
- minps(SRC1, SRC2);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_MOVA(Instruction instr) {
- SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]};
-
- if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
- return; // NoOp
- }
-
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
-
- // Convert floats to integers using truncation (only care about X and Y components)
- cvttps2dq(SRC1, SRC1);
-
- // Get result
- movq(rax, SRC1);
-
- // Handle destination enable
- if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
- // Move and sign-extend low 32 bits
- movsxd(ADDROFFS_REG_0, eax);
-
- // Move and sign-extend high 32 bits
- shr(rax, 32);
- movsxd(ADDROFFS_REG_1, eax);
-
- // Multiply by 16 to be used as an offset later
- shl(ADDROFFS_REG_0, 4);
- shl(ADDROFFS_REG_1, 4);
- } else {
- if (swiz.DestComponentEnabled(0)) {
- // Move and sign-extend low 32 bits
- movsxd(ADDROFFS_REG_0, eax);
-
- // Multiply by 16 to be used as an offset later
- shl(ADDROFFS_REG_0, 4);
- } else if (swiz.DestComponentEnabled(1)) {
- // Move and sign-extend high 32 bits
- shr(rax, 32);
- movsxd(ADDROFFS_REG_1, eax);
-
- // Multiply by 16 to be used as an offset later
- shl(ADDROFFS_REG_1, 4);
- }
- }
-}
-
-void JitShader::Compile_MOV(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_RCP(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
-
- // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
- // performs this operation more accurately. This should be checked on hardware.
- rcpss(SRC1, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_RSQ(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
-
- // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
- // performs this operation more accurately. This should be checked on hardware.
- rsqrtss(SRC1, SRC1);
- shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_NOP(Instruction instr) {}
-
-void JitShader::Compile_END(Instruction instr) {
- ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
- ret();
-}
-
-void JitShader::Compile_CALL(Instruction instr) {
- // Push offset of the return
- push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
-
- // Call the subroutine
- call(instruction_labels[instr.flow_control.dest_offset]);
-
- // Skip over the return offset that's on the stack
- add(rsp, 8);
-}
-
-void JitShader::Compile_CALLC(Instruction instr) {
- Compile_EvaluateCondition(instr);
- Label b;
- jz(b);
- Compile_CALL(instr);
- L(b);
-}
-
-void JitShader::Compile_CALLU(Instruction instr) {
- Compile_UniformCondition(instr);
- Label b;
- jz(b);
- Compile_CALL(instr);
- L(b);
-}
-
-void JitShader::Compile_CMP(Instruction instr) {
- using Op = Instruction::Common::CompareOpType::Op;
- Op op_x = instr.common.compare_op.x;
- Op op_y = instr.common.compare_op.y;
-
- Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
- Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-
- // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
- // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
- // because they don't match when used with NaNs.
- static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
-
- bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
- Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
- Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
-
- if (op_x == op_y) {
- // Compare X-component and Y-component together
- cmpps(lhs_x, rhs_x, cmp[op_x]);
- movq(COND0, lhs_x);
-
- mov(COND1, COND0);
- } else {
- bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
- Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
- Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
-
- // Compare X-component
- movaps(SCRATCH, lhs_x);
- cmpss(SCRATCH, rhs_x, cmp[op_x]);
-
- // Compare Y-component
- cmpps(lhs_y, rhs_y, cmp[op_y]);
-
- movq(COND0, SCRATCH);
- movq(COND1, lhs_y);
- }
-
- shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
- shr(COND1, 63);
-}
-
-void JitShader::Compile_MAD(Instruction instr) {
- Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
-
- if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
- Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
- Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
- } else {
- Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
- Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
- }
-
- Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
- addps(SRC1, SRC3);
-
- Compile_DestEnable(instr, SRC1);
-}
-
-void JitShader::Compile_IF(Instruction instr) {
- Compile_Assert(instr.flow_control.dest_offset >= program_counter,
- "Backwards if-statements not supported");
- Label l_else, l_endif;
-
- // Evaluate the "IF" condition
- if (instr.opcode.Value() == OpCode::Id::IFU) {
- Compile_UniformCondition(instr);
- } else if (instr.opcode.Value() == OpCode::Id::IFC) {
- Compile_EvaluateCondition(instr);
- }
- jz(l_else, T_NEAR);
-
- // Compile the code that corresponds to the condition evaluating as true
- Compile_Block(instr.flow_control.dest_offset);
-
- // If there isn't an "ELSE" condition, we are done here
- if (instr.flow_control.num_instructions == 0) {
- L(l_else);
- return;
- }
-
- jmp(l_endif, T_NEAR);
-
- L(l_else);
- // This code corresponds to the "ELSE" condition
- // Comple the code that corresponds to the condition evaluating as false
- Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
-
- L(l_endif);
-}
-
-void JitShader::Compile_LOOP(Instruction instr) {
- Compile_Assert(instr.flow_control.dest_offset >= program_counter,
- "Backwards loops not supported");
- Compile_Assert(!looping, "Nested loops not supported");
-
- looping = true;
-
- // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
- // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
- // 4 bits) to be used as an offset into the 16-byte vector registers later
- size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
- mov(LOOPCOUNT, dword[SETUP + offset]);
- mov(LOOPCOUNT_REG, LOOPCOUNT);
- shr(LOOPCOUNT_REG, 4);
- and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
- mov(LOOPINC, LOOPCOUNT);
- shr(LOOPINC, 12);
- and(LOOPINC, 0xFF0); // Z-component is the incrementer
- movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
- add(LOOPCOUNT, 1); // Iteration count is X-component + 1
-
- Label l_loop_start;
- L(l_loop_start);
-
- Compile_Block(instr.flow_control.dest_offset + 1);
-
- add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
- sub(LOOPCOUNT, 1); // Increment loop count by 1
- jnz(l_loop_start); // Loop if not equal
-
- looping = false;
-}
-
-void JitShader::Compile_JMP(Instruction instr) {
- if (instr.opcode.Value() == OpCode::Id::JMPC)
- Compile_EvaluateCondition(instr);
- else if (instr.opcode.Value() == OpCode::Id::JMPU)
- Compile_UniformCondition(instr);
- else
- UNREACHABLE();
-
- bool inverted_condition =
- (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
-
- Label& b = instruction_labels[instr.flow_control.dest_offset];
- if (inverted_condition) {
- jz(b, T_NEAR);
- } else {
- jnz(b, T_NEAR);
- }
-}
-
-void JitShader::Compile_Block(unsigned end) {
- while (program_counter < end) {
- Compile_NextInstr();
- }
-}
-
-void JitShader::Compile_Return() {
- // Peek return offset on the stack and check if we're at that offset
- mov(rax, qword[rsp + 8]);
- cmp(eax, (program_counter));
-
- // If so, jump back to before CALL
- Label b;
- jnz(b);
- ret();
- L(b);
-}
-
-void JitShader::Compile_NextInstr() {
- if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
- Compile_Return();
- }
-
- L(instruction_labels[program_counter]);
-
- Instruction instr = GetVertexShaderInstruction(program_counter++);
-
- OpCode::Id opcode = instr.opcode.Value();
- auto instr_func = instr_table[static_cast<unsigned>(opcode)];
-
- if (instr_func) {
- // JIT the instruction!
- ((*this).*instr_func)(instr);
- } else {
- // Unhandled instruction
- LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
- instr.opcode.Value().EffectiveOpCode(), instr.hex);
- }
-}
-
-void JitShader::FindReturnOffsets() {
- return_offsets.clear();
-
- for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) {
- Instruction instr = GetVertexShaderInstruction(offset);
-
- switch (instr.opcode.Value()) {
- case OpCode::Id::CALL:
- case OpCode::Id::CALLC:
- case OpCode::Id::CALLU:
- return_offsets.push_back(instr.flow_control.dest_offset +
- instr.flow_control.num_instructions);
- break;
- default:
- break;
- }
- }
-
- // Sort for efficient binary search later
- std::sort(return_offsets.begin(), return_offsets.end());
-}
-
-void JitShader::Compile() {
- // Reset flow control state
- program = (CompiledShader*)getCurr();
- program_counter = 0;
- looping = false;
- instruction_labels.fill(Xbyak::Label());
-
- // Find all `CALL` instructions and identify return locations
- FindReturnOffsets();
-
- // The stack pointer is 8 modulo 16 at the entry of a procedure
- ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
-
- mov(SETUP, ABI_PARAM1);
- mov(STATE, ABI_PARAM2);
-
- // Zero address/loop registers
- xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
- xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
- xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
-
- // Used to set a register to one
- static const __m128 one = {1.f, 1.f, 1.f, 1.f};
- mov(rax, reinterpret_cast<size_t>(&one));
- movaps(ONE, xword[rax]);
-
- // Used to negate registers
- static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
- mov(rax, reinterpret_cast<size_t>(&neg));
- movaps(NEGBIT, xword[rax]);
-
- // Jump to start of the shader program
- jmp(ABI_PARAM3);
-
- // Compile entire program
- Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
-
- // Free memory that's no longer needed
- return_offsets.clear();
- return_offsets.shrink_to_fit();
-
- ready();
-
- uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program);
- ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
- LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size);
-}
-
-JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
-
} // namespace Shader
-
} // namespace Pica
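The SetupBatch implementation above turns shader compilation into a cache lookup: the program code and swizzle data are hashed separately, the two hashes are combined into a single key, and a JitShader is only compiled on a miss. The same lookup-or-compile pattern in a self-contained sketch (std::hash of a string stands in for Common::ComputeHash64, and CompiledThing stands in for JitShader):

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct CompiledThing {};  // stand-in for a compiled shader

    std::unordered_map<std::uint64_t, std::unique_ptr<CompiledThing>> cache;

    CompiledThing* LookupOrCompile(const std::string& code, const std::string& swizzles) {
        const std::uint64_t key =
            std::hash<std::string>{}(code) ^ std::hash<std::string>{}(swizzles);
        auto iter = cache.find(key);
        if (iter != cache.end())
            return iter->second.get();                      // hit: reuse the compiled shader
        auto compiled = std::make_unique<CompiledThing>();  // miss: this is where Compile() would run
        CompiledThing* raw = compiled.get();
        cache.emplace_hint(iter, key, std::move(compiled));
        return raw;
    }

As in the patch, the key alone identifies the program, so a 64-bit hash collision would silently reuse the wrong shader.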
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index f37548306..078b2cba5 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -1,121 +1,30 @@
-// Copyright 2015 Citra Emulator Project
+// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
-#include <array>
-#include <cstddef>
-#include <utility>
-#include <vector>
-#include <nihstro/shader_bytecode.h>
-#include <xbyak.h>
-#include "common/bit_set.h"
+#include <memory>
+#include <unordered_map>
#include "common/common_types.h"
-#include "common/x64/emitter.h"
#include "video_core/shader/shader.h"
-using nihstro::Instruction;
-using nihstro::OpCode;
-using nihstro::SwizzlePattern;
-
namespace Pica {
-
namespace Shader {
-/// Memory allocated for each compiled shader (64Kb)
-constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
+class JitShader;
-/**
- * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
- * code that can be executed on the host machine directly.
- */
-class JitShader : public Xbyak::CodeGenerator {
+class JitX64Engine final : public ShaderEngine {
public:
- JitShader();
-
- void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
- program(&setup, &state, instruction_labels[offset].getAddress());
- }
-
- void Compile();
+ JitX64Engine();
+ ~JitX64Engine() override;
- void Compile_ADD(Instruction instr);
- void Compile_DP3(Instruction instr);
- void Compile_DP4(Instruction instr);
- void Compile_DPH(Instruction instr);
- void Compile_EX2(Instruction instr);
- void Compile_LG2(Instruction instr);
- void Compile_MUL(Instruction instr);
- void Compile_SGE(Instruction instr);
- void Compile_SLT(Instruction instr);
- void Compile_FLR(Instruction instr);
- void Compile_MAX(Instruction instr);
- void Compile_MIN(Instruction instr);
- void Compile_RCP(Instruction instr);
- void Compile_RSQ(Instruction instr);
- void Compile_MOVA(Instruction instr);
- void Compile_MOV(Instruction instr);
- void Compile_NOP(Instruction instr);
- void Compile_END(Instruction instr);
- void Compile_CALL(Instruction instr);
- void Compile_CALLC(Instruction instr);
- void Compile_CALLU(Instruction instr);
- void Compile_IF(Instruction instr);
- void Compile_LOOP(Instruction instr);
- void Compile_JMP(Instruction instr);
- void Compile_CMP(Instruction instr);
- void Compile_MAD(Instruction instr);
+ void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
+ void Run(const ShaderSetup& setup, UnitState& state) const override;
private:
- void Compile_Block(unsigned end);
- void Compile_NextInstr();
-
- void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
- Xbyak::Xmm dest);
- void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);
-
- /**
- * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
- * zero by inf. Clobbers `src2` and `scratch`.
- */
- void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
-
- void Compile_EvaluateCondition(Instruction instr);
- void Compile_UniformCondition(Instruction instr);
-
- /**
- * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction.
- */
- void Compile_Return();
-
- BitSet32 PersistentCallerSavedRegs();
-
- /**
- * Assertion evaluated at compile-time, but only triggered if executed at runtime.
- * @param msg Message to be logged if the assertion fails.
- */
- void Compile_Assert(bool condition, const char* msg);
-
- /**
- * Analyzes the entire shader program for `CALL` instructions before emitting any code,
- * identifying the locations where a return needs to be inserted.
- */
- void FindReturnOffsets();
-
- /// Mapping of Pica VS instructions to pointers in the emitted code
- std::array<Xbyak::Label, 1024> instruction_labels;
-
- /// Offsets in code where a return needs to be inserted
- std::vector<unsigned> return_offsets;
-
- unsigned program_counter = 0; ///< Offset of the next instruction to decode
- bool looping = false; ///< True if compiling a loop, used to check for nested loops
-
- using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
- CompiledShader* program = nullptr;
+ std::unordered_map<u64, std::unique_ptr<JitShader>> cache;
};
-} // Shader
-
-} // Pica
+} // namespace Shader
+} // namespace Pica
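The new header only forward-declares JitShader and stores the cache as std::unique_ptr to that incomplete type; the out-of-line `JitX64Engine::~JitX64Engine() = default;` in the .cpp is what makes that legal, and it keeps xbyak and the compiler header out of everything that includes shader_jit_x64.h. The general idiom, as a sketch with hypothetical names:

    // widget.h -- same pattern as JitX64Engine/JitShader
    #include <memory>
    class Impl;                       // incomplete type in the header
    class Widget {
    public:
        Widget();
        ~Widget();                    // declared here, defined where Impl is complete
    private:
        std::unique_ptr<Impl> impl;
    };

    // widget.cpp
    class Impl {};                    // full definition available in this translation unit
    Widget::Widget() : impl(std::make_unique<Impl>()) {}
    Widget::~Widget() = default;      // unique_ptr<Impl>'s deleter is instantiated here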
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
new file mode 100644
index 000000000..92b35dbc0
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -0,0 +1,889 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <nihstro/shader_bytecode.h>
+#include <smmintrin.h>
+#include <xmmintrin.h>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/vector_math.h"
+#include "common/x64/cpu_detect.h"
+#include "common/x64/xbyak_abi.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/pica_state.h"
+#include "video_core/pica_types.h"
+#include "video_core/shader/shader.h"
+#include "video_core/shader/shader_jit_x64_compiler.h"
+
+using namespace Common::X64;
+using namespace Xbyak::util;
+using Xbyak::Label;
+using Xbyak::Reg32;
+using Xbyak::Reg64;
+using Xbyak::Xmm;
+
+namespace Pica {
+
+namespace Shader {
+
+typedef void (JitShader::*JitFunction)(Instruction instr);
+
+const JitFunction instr_table[64] = {
+ &JitShader::Compile_ADD, // add
+ &JitShader::Compile_DP3, // dp3
+ &JitShader::Compile_DP4, // dp4
+ &JitShader::Compile_DPH, // dph
+ nullptr, // unknown
+ &JitShader::Compile_EX2, // ex2
+ &JitShader::Compile_LG2, // lg2
+ nullptr, // unknown
+ &JitShader::Compile_MUL, // mul
+ &JitShader::Compile_SGE, // sge
+ &JitShader::Compile_SLT, // slt
+ &JitShader::Compile_FLR, // flr
+ &JitShader::Compile_MAX, // max
+ &JitShader::Compile_MIN, // min
+ &JitShader::Compile_RCP, // rcp
+ &JitShader::Compile_RSQ, // rsq
+ nullptr, // unknown
+ nullptr, // unknown
+ &JitShader::Compile_MOVA, // mova
+ &JitShader::Compile_MOV, // mov
+ nullptr, // unknown
+ nullptr, // unknown
+ nullptr, // unknown
+ nullptr, // unknown
+ &JitShader::Compile_DPH, // dphi
+ nullptr, // unknown
+ &JitShader::Compile_SGE, // sgei
+ &JitShader::Compile_SLT, // slti
+ nullptr, // unknown
+ nullptr, // unknown
+ nullptr, // unknown
+ nullptr, // unknown
+ nullptr, // unknown
+ &JitShader::Compile_NOP, // nop
+ &JitShader::Compile_END, // end
+ nullptr, // break
+ &JitShader::Compile_CALL, // call
+ &JitShader::Compile_CALLC, // callc
+ &JitShader::Compile_CALLU, // callu
+ &JitShader::Compile_IF, // ifu
+ &JitShader::Compile_IF, // ifc
+ &JitShader::Compile_LOOP, // loop
+ nullptr, // emit
+ nullptr, // sete
+ &JitShader::Compile_JMP, // jmpc
+ &JitShader::Compile_JMP, // jmpu
+ &JitShader::Compile_CMP, // cmp
+ &JitShader::Compile_CMP, // cmp
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+};
+
+// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
+// be used as scratch registers within a compiler function. The other registers have designated
+// purposes, as documented below:
+
+/// Pointer to the uniform memory
+static const Reg64 SETUP = r9;
+/// The two 32-bit VS address offset registers set by the MOVA instruction
+static const Reg64 ADDROFFS_REG_0 = r10;
+static const Reg64 ADDROFFS_REG_1 = r11;
+/// VS loop count register (Multiplied by 16)
+static const Reg32 LOOPCOUNT_REG = r12d;
+/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
+static const Reg32 LOOPCOUNT = esi;
+/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
+static const Reg32 LOOPINC = edi;
+/// Result of the previous CMP instruction for the X-component comparison
+static const Reg64 COND0 = r13;
+/// Result of the previous CMP instruction for the Y-component comparison
+static const Reg64 COND1 = r14;
+/// Pointer to the UnitState instance for the current VS unit
+static const Reg64 STATE = r15;
+/// SIMD scratch register
+static const Xmm SCRATCH = xmm0;
+/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
+static const Xmm SRC1 = xmm1;
+/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
+static const Xmm SRC2 = xmm2;
+/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
+static const Xmm SRC3 = xmm3;
+/// Additional scratch register
+static const Xmm SCRATCH2 = xmm4;
+/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
+static const Xmm ONE = xmm14;
+/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
+static const Xmm NEGBIT = xmm15;
+
+// State registers that must not be modified by external function calls
+// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
+static const BitSet32 persistent_regs = BuildRegSet({
+ // Pointers to register blocks
+ SETUP, STATE,
+ // Cached registers
+ ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
+ // Constants
+ ONE, NEGBIT,
+ // Loop variables
+ LOOPCOUNT, LOOPINC,
+});
+
+/// Raw constant for the source register selector that indicates no swizzling is performed
+static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
+/// Raw constant for the destination register enable mask that indicates all components are enabled
+static const u8 NO_DEST_REG_MASK = 0xf;
+
+static void LogCritical(const char* msg) {
+ LOG_CRITICAL(HW_GPU, "%s", msg);
+}
+
+void JitShader::Compile_Assert(bool condition, const char* msg) {
+ if (!condition) {
+ mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
+ CallFarFunction(*this, LogCritical);
+ }
+}
+
+/**
+ * Loads and swizzles a source register into the specified XMM register.
+ * @param instr VS instruction, used for determining how to load the source register
+ * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
+ * @param src_reg SourceRegister object corresponding to the source register to load
+ * @param dest Destination XMM register to store the loaded, swizzled source register
+ */
+void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
+ Xmm dest) {
+ Reg64 src_ptr;
+ size_t src_offset;
+
+ if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
+ src_ptr = SETUP;
+ src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
+ } else {
+ src_ptr = STATE;
+ src_offset = UnitState::InputOffset(src_reg);
+ }
+
+ int src_offset_disp = (int)src_offset;
+ ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
+
+ unsigned operand_desc_id;
+
+ const bool is_inverted =
+ (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
+
+ unsigned address_register_index;
+ unsigned offset_src;
+
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
+ instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+ operand_desc_id = instr.mad.operand_desc_id;
+ offset_src = is_inverted ? 3 : 2;
+ address_register_index = instr.mad.address_register_index;
+ } else {
+ operand_desc_id = instr.common.operand_desc_id;
+ offset_src = is_inverted ? 2 : 1;
+ address_register_index = instr.common.address_register_index;
+ }
+
+ if (src_num == offset_src && address_register_index != 0) {
+ switch (address_register_index) {
+ case 1: // address offset 1
+ movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
+ break;
+ case 2: // address offset 2
+ movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
+ break;
+ case 3: // address offset 3
+ movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+ } else {
+ // Load the source
+ movaps(dest, xword[src_ptr + src_offset_disp]);
+ }
+
+ SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
+
+ // Generate instructions for source register swizzling as needed
+ u8 sel = swiz.GetRawSelector(src_num);
+ if (sel != NO_SRC_REG_SWIZZLE) {
+ // Selector component order needs to be reversed for the SHUFPS instruction
+ sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
+
+ // Shuffle inputs for swizzle
+ shufps(dest, dest, sel);
+ }
+
+ // If the source register should be negated, flip the negative bit using XOR
+ const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
+ if (negate[src_num - 1]) {
+ xorps(dest, NEGBIT);
+ }
+}
+
+void JitShader::Compile_DestEnable(Instruction instr, Xmm src) {
+ DestRegister dest;
+ unsigned operand_desc_id;
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
+ instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+ operand_desc_id = instr.mad.operand_desc_id;
+ dest = instr.mad.dest.Value();
+ } else {
+ operand_desc_id = instr.common.operand_desc_id;
+ dest = instr.common.dest.Value();
+ }
+
+ SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
+
+ size_t dest_offset_disp = UnitState::OutputOffset(dest);
+
+ // If all components are enabled, write the result to the destination register
+ if (swiz.dest_mask == NO_DEST_REG_MASK) {
+ // Store dest back to memory
+ movaps(xword[STATE + dest_offset_disp], src);
+
+ } else {
+ // Not all components are enabled, so mask the result when storing to the destination
+ // register...
+ movaps(SCRATCH, xword[STATE + dest_offset_disp]);
+
+ if (Common::GetCPUCaps().sse4_1) {
+ u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
+ ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
+ blendps(SCRATCH, src, mask);
+ } else {
+ movaps(SCRATCH2, src);
+ unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
+ unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
+
+ // Compute selector to selectively copy source components to destination for SHUFPS
+ // instruction
+ u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
+ ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
+ ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
+ ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
+ shufps(SCRATCH, SCRATCH2, sel);
+ }
+
+ // Store dest back to memory
+ movaps(xword[STATE + dest_offset_disp], SCRATCH);
+ }
+}
+
+void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
+ movaps(scratch, src1);
+ cmpordps(scratch, src2);
+
+ mulps(src1, src2);
+
+ movaps(src2, src1);
+ cmpunordps(src2, src2);
+
+ xorps(scratch, src2);
+ andps(src1, scratch);
+}
+
+void JitShader::Compile_EvaluateCondition(Instruction instr) {
+ // Note: NXOR is used below to check for equality
+ switch (instr.flow_control.op) {
+ case Instruction::FlowControlType::Or:
+ mov(eax, COND0);
+ mov(ebx, COND1);
+ xor(eax, (instr.flow_control.refx.Value() ^ 1));
+ xor(ebx, (instr.flow_control.refy.Value() ^ 1));
+ or (eax, ebx);
+ break;
+
+ case Instruction::FlowControlType::And:
+ mov(eax, COND0);
+ mov(ebx, COND1);
+ xor(eax, (instr.flow_control.refx.Value() ^ 1));
+ xor(ebx, (instr.flow_control.refy.Value() ^ 1));
+ and(eax, ebx);
+ break;
+
+ case Instruction::FlowControlType::JustX:
+ mov(eax, COND0);
+ xor(eax, (instr.flow_control.refx.Value() ^ 1));
+ break;
+
+ case Instruction::FlowControlType::JustY:
+ mov(eax, COND1);
+ xor(eax, (instr.flow_control.refy.Value() ^ 1));
+ break;
+ }
+}
+
+void JitShader::Compile_UniformCondition(Instruction instr) {
+ size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
+ cmp(byte[SETUP + offset], 0);
+}
+
+BitSet32 JitShader::PersistentCallerSavedRegs() {
+ return persistent_regs & ABI_ALL_CALLER_SAVED;
+}
+
+void JitShader::Compile_ADD(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ addps(SRC1, SRC2);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_DP3(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+
+ movaps(SRC2, SRC1);
+ shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
+
+ movaps(SRC3, SRC1);
+ shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
+
+ shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
+ addps(SRC1, SRC2);
+ addps(SRC1, SRC3);
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_DP4(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+
+ movaps(SRC2, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ addps(SRC1, SRC2);
+
+ movaps(SRC2, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ addps(SRC1, SRC2);
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_DPH(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ if (Common::GetCPUCaps().sse4_1) {
+ // Set 4th component to 1.0
+ blendps(SRC1, ONE, 0b1000);
+ } else {
+ // Set 4th component to 1.0
+ movaps(SCRATCH, SRC1);
+ unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
+ unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
+ }
+
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+
+ movaps(SRC2, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+ addps(SRC1, SRC2);
+
+ movaps(SRC2, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+ addps(SRC1, SRC2);
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_EX2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ movss(xmm0, SRC1); // ABI_PARAM1
+
+ ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+ CallFarFunction(*this, exp2f);
+ ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+ shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
+ movaps(SRC1, xmm0);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_LG2(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ movss(xmm0, SRC1); // ABI_PARAM1
+
+ ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+ CallFarFunction(*this, log2f);
+ ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+ shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
+ movaps(SRC1, xmm0);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_MUL(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_SGE(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ cmpleps(SRC2, SRC1);
+ andps(SRC2, ONE);
+
+ Compile_DestEnable(instr, SRC2);
+}
+
+void JitShader::Compile_SLT(Instruction instr) {
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+ } else {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ }
+
+ cmpltps(SRC1, SRC2);
+ andps(SRC1, ONE);
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_FLR(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+ if (Common::GetCPUCaps().sse4_1) {
+ roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
+ } else {
+ cvttps2dq(SRC1, SRC1);
+ cvtdq2ps(SRC1, SRC1);
+ }
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_MAX(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
+ maxps(SRC1, SRC2);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_MIN(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+ // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
+ minps(SRC1, SRC2);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_MOVA(Instruction instr) {
+ SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]};
+
+ if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
+ return; // NoOp
+ }
+
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+ // Convert floats to integers using truncation (only care about X and Y components)
+ cvttps2dq(SRC1, SRC1);
+
+ // Get result
+ movq(rax, SRC1);
+
+ // Handle destination enable
+ if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
+ // Move and sign-extend low 32 bits
+ movsxd(ADDROFFS_REG_0, eax);
+
+ // Move and sign-extend high 32 bits
+ shr(rax, 32);
+ movsxd(ADDROFFS_REG_1, eax);
+
+ // Multiply by 16 to be used as an offset later
+ shl(ADDROFFS_REG_0, 4);
+ shl(ADDROFFS_REG_1, 4);
+ } else {
+ if (swiz.DestComponentEnabled(0)) {
+ // Move and sign-extend low 32 bits
+ movsxd(ADDROFFS_REG_0, eax);
+
+ // Multiply by 16 to be used as an offset later
+ shl(ADDROFFS_REG_0, 4);
+ } else if (swiz.DestComponentEnabled(1)) {
+ // Move and sign-extend high 32 bits
+ shr(rax, 32);
+ movsxd(ADDROFFS_REG_1, eax);
+
+ // Multiply by 16 to be used as an offset later
+ shl(ADDROFFS_REG_1, 4);
+ }
+ }
+}
+
+void JitShader::Compile_MOV(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_RCP(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+ // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
+ // performs this operation more accurately. This should be checked on hardware.
+ rcpss(SRC1, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_RSQ(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+ // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
+ // performs this operation more accurately. This should be checked on hardware.
+ rsqrtss(SRC1, SRC1);
+ shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_NOP(Instruction instr) {}
+
+void JitShader::Compile_END(Instruction instr) {
+ ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
+ ret();
+}
+
+void JitShader::Compile_CALL(Instruction instr) {
+ // Push offset of the return
+ push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
+
+ // Call the subroutine
+ call(instruction_labels[instr.flow_control.dest_offset]);
+
+ // Skip over the return offset that's on the stack
+ add(rsp, 8);
+}
+
+void JitShader::Compile_CALLC(Instruction instr) {
+ Compile_EvaluateCondition(instr);
+ Label b;
+ jz(b);
+ Compile_CALL(instr);
+ L(b);
+}
+
+void JitShader::Compile_CALLU(Instruction instr) {
+ Compile_UniformCondition(instr);
+ Label b;
+ jz(b);
+ Compile_CALL(instr);
+ L(b);
+}
+
+void JitShader::Compile_CMP(Instruction instr) {
+ using Op = Instruction::Common::CompareOpType::Op;
+ Op op_x = instr.common.compare_op.x;
+ Op op_y = instr.common.compare_op.y;
+
+ Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+ Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+ // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+ // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+ // because they don't match when used with NaNs.
+ static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
+
+ bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+ Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
+ Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
+
+ if (op_x == op_y) {
+ // Compare X-component and Y-component together
+ cmpps(lhs_x, rhs_x, cmp[op_x]);
+ movq(COND0, lhs_x);
+
+ mov(COND1, COND0);
+ } else {
+ bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+ Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
+ Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
+
+ // Compare X-component
+ movaps(SCRATCH, lhs_x);
+ cmpss(SCRATCH, rhs_x, cmp[op_x]);
+
+ // Compare Y-component
+ cmpps(lhs_y, rhs_y, cmp[op_y]);
+
+ movq(COND0, SCRATCH);
+ movq(COND1, lhs_y);
+ }
+
+ shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
+ shr(COND1, 63);
+}
+
+void JitShader::Compile_MAD(Instruction instr) {
+ Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
+
+ if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+ Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
+ Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
+ } else {
+ Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
+ Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
+ }
+
+ Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+ addps(SRC1, SRC3);
+
+ Compile_DestEnable(instr, SRC1);
+}
+
+void JitShader::Compile_IF(Instruction instr) {
+ Compile_Assert(instr.flow_control.dest_offset >= program_counter,
+ "Backwards if-statements not supported");
+ Label l_else, l_endif;
+
+ // Evaluate the "IF" condition
+ if (instr.opcode.Value() == OpCode::Id::IFU) {
+ Compile_UniformCondition(instr);
+ } else if (instr.opcode.Value() == OpCode::Id::IFC) {
+ Compile_EvaluateCondition(instr);
+ }
+ jz(l_else, T_NEAR);
+
+ // Compile the code that corresponds to the condition evaluating as true
+ Compile_Block(instr.flow_control.dest_offset);
+
+ // If there isn't an "ELSE" condition, we are done here
+ if (instr.flow_control.num_instructions == 0) {
+ L(l_else);
+ return;
+ }
+
+ jmp(l_endif, T_NEAR);
+
+ L(l_else);
+ // This code corresponds to the "ELSE" condition
+ // Compile the code that corresponds to the condition evaluating as false
+ Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+
+ L(l_endif);
+}
+
+void JitShader::Compile_LOOP(Instruction instr) {
+ Compile_Assert(instr.flow_control.dest_offset >= program_counter,
+ "Backwards loops not supported");
+ Compile_Assert(!looping, "Nested loops not supported");
+
+ looping = true;
+
+ // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
+ // The Y (LOOPCOUNT_REG) and Z (LOOPINC) components are kept multiplied by 16 (left-shifted by
+ // 4 bits) so they can be used directly as offsets into the 16-byte vector registers later.
+ size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
+ mov(LOOPCOUNT, dword[SETUP + offset]);
+ mov(LOOPCOUNT_REG, LOOPCOUNT);
+ shr(LOOPCOUNT_REG, 4);
+ and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
+ mov(LOOPINC, LOOPCOUNT);
+ shr(LOOPINC, 12);
+ and(LOOPINC, 0xFF0); // Z-component is the incrementer
+ movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
+ add(LOOPCOUNT, 1); // Iteration count is X-component + 1
+
+ Label l_loop_start;
+ L(l_loop_start);
+
+ Compile_Block(instr.flow_control.dest_offset + 1);
+
+ add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
+ sub(LOOPCOUNT, 1); // Decrement the remaining iteration count
+ jnz(l_loop_start); // Loop back while the iteration count is not zero
+
+ looping = false;
+}
+
+void JitShader::Compile_JMP(Instruction instr) {
+ if (instr.opcode.Value() == OpCode::Id::JMPC)
+ Compile_EvaluateCondition(instr);
+ else if (instr.opcode.Value() == OpCode::Id::JMPU)
+ Compile_UniformCondition(instr);
+ else
+ UNREACHABLE();
+
+ bool inverted_condition =
+ (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
+
+ Label& b = instruction_labels[instr.flow_control.dest_offset];
+ if (inverted_condition) {
+ jz(b, T_NEAR);
+ } else {
+ jnz(b, T_NEAR);
+ }
+}
+
+void JitShader::Compile_Block(unsigned end) {
+ while (program_counter < end) {
+ Compile_NextInstr();
+ }
+}
+
+void JitShader::Compile_Return() {
+ // Peek return offset on the stack and check if we're at that offset
+ mov(rax, qword[rsp + 8]);
+ cmp(eax, (program_counter));
+
+ // If so, jump back to before CALL
+ Label b;
+ jnz(b);
+ ret();
+ L(b);
+}
+
+void JitShader::Compile_NextInstr() {
+ if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
+ Compile_Return();
+ }
+
+ L(instruction_labels[program_counter]);
+
+ Instruction instr = {(*program_code)[program_counter++]};
+
+ OpCode::Id opcode = instr.opcode.Value();
+ auto instr_func = instr_table[static_cast<unsigned>(opcode)];
+
+ if (instr_func) {
+ // JIT the instruction!
+ ((*this).*instr_func)(instr);
+ } else {
+ // Unhandled instruction
+ LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
+ instr.opcode.Value().EffectiveOpCode(), instr.hex);
+ }
+}
+
+void JitShader::FindReturnOffsets() {
+ return_offsets.clear();
+
+ for (size_t offset = 0; offset < program_code->size(); ++offset) {
+ Instruction instr = {(*program_code)[offset]};
+
+ switch (instr.opcode.Value()) {
+ case OpCode::Id::CALL:
+ case OpCode::Id::CALLC:
+ case OpCode::Id::CALLU:
+ return_offsets.push_back(instr.flow_control.dest_offset +
+ instr.flow_control.num_instructions);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Sort for efficient binary search later
+ std::sort(return_offsets.begin(), return_offsets.end());
+}
+
+void JitShader::Compile(const std::array<u32, 1024>* program_code_,
+ const std::array<u32, 1024>* swizzle_data_) {
+ program_code = program_code_;
+ swizzle_data = swizzle_data_;
+
+ // Reset flow control state
+ program = (CompiledShader*)getCurr();
+ program_counter = 0;
+ looping = false;
+ instruction_labels.fill(Xbyak::Label());
+
+ // Find all `CALL` instructions and identify return locations
+ FindReturnOffsets();
+
+ // The stack pointer is 8 modulo 16 at the entry of a procedure.
+ // We reserve 16 bytes and assign a dummy value to the first 8 bytes, to catch any potential
+ // return checks (see Compile_Return) that happen in the shader's main routine.
+ ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
+ mov(qword[rsp + 8], 0xFFFFFFFFFFFFFFFFULL);
+
+ mov(SETUP, ABI_PARAM1);
+ mov(STATE, ABI_PARAM2);
+
+ // Zero address/loop registers
+ xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
+ xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
+ xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
+
+ // Used to set a register to one
+ static const __m128 one = {1.f, 1.f, 1.f, 1.f};
+ mov(rax, reinterpret_cast<size_t>(&one));
+ movaps(ONE, xword[rax]);
+
+ // Used to negate registers
+ static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
+ mov(rax, reinterpret_cast<size_t>(&neg));
+ movaps(NEGBIT, xword[rax]);
+
+ // Jump to start of the shader program
+ jmp(ABI_PARAM3);
+
+ // Compile entire program
+ Compile_Block(static_cast<unsigned>(program_code->size()));
+
+ // Free memory that's no longer needed
+ program_code = nullptr;
+ swizzle_data = nullptr;
+ return_offsets.clear();
+ return_offsets.shrink_to_fit();
+
+ ready();
+
+ ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
+ LOG_DEBUG(HW_GPU, "Compiled shader size=%zu", getSize());
+}
+
+JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
+
+} // namespace Shader
+
+} // namespace Pica
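
The operand-swapping trick in Compile_CMP above can be sanity-checked with a scalar reference: GT and GE are emulated as LT and LE with lhs/rhs exchanged because the NLT/NLE predicates evaluate to true when either operand is NaN. A minimal standalone sketch of that reasoning (illustrative C++, not part of this patch):

    #include <cassert>
    #include <cmath>

    // Reference semantics for the swapped-operand emulation used by Compile_CMP:
    // GT(a, b) is computed as LT(b, a), which, like cmpps with CMP_LT, is false for NaN inputs.
    static bool GreaterThan(float a, float b) {
        return b < a; // NLT, i.e. !(a < b), would incorrectly return true for NaNs
    }

    int main() {
        assert(GreaterThan(2.0f, 1.0f));
        assert(!GreaterThan(1.0f, 2.0f));
        assert(!GreaterThan(std::nanf(""), 1.0f)); // NaN comparisons must come out false
        return 0;
    }
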
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
new file mode 100644
index 000000000..599e43ffd
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -0,0 +1,124 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <utility>
+#include <vector>
+#include <nihstro/shader_bytecode.h>
+#include <xbyak.h>
+#include "common/bit_set.h"
+#include "common/common_types.h"
+#include "video_core/shader/shader.h"
+
+using nihstro::Instruction;
+using nihstro::OpCode;
+using nihstro::SwizzlePattern;
+
+namespace Pica {
+
+namespace Shader {
+
+/// Memory allocated for each compiled shader (64 KiB)
+constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
+
+/**
+ * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
+ * code that can be executed on the host machine directly.
+ */
+class JitShader : public Xbyak::CodeGenerator {
+public:
+ JitShader();
+
+ void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
+ program(&setup, &state, instruction_labels[offset].getAddress());
+ }
+
+ void Compile(const std::array<u32, 1024>* program_code,
+ const std::array<u32, 1024>* swizzle_data);
+
+ void Compile_ADD(Instruction instr);
+ void Compile_DP3(Instruction instr);
+ void Compile_DP4(Instruction instr);
+ void Compile_DPH(Instruction instr);
+ void Compile_EX2(Instruction instr);
+ void Compile_LG2(Instruction instr);
+ void Compile_MUL(Instruction instr);
+ void Compile_SGE(Instruction instr);
+ void Compile_SLT(Instruction instr);
+ void Compile_FLR(Instruction instr);
+ void Compile_MAX(Instruction instr);
+ void Compile_MIN(Instruction instr);
+ void Compile_RCP(Instruction instr);
+ void Compile_RSQ(Instruction instr);
+ void Compile_MOVA(Instruction instr);
+ void Compile_MOV(Instruction instr);
+ void Compile_NOP(Instruction instr);
+ void Compile_END(Instruction instr);
+ void Compile_CALL(Instruction instr);
+ void Compile_CALLC(Instruction instr);
+ void Compile_CALLU(Instruction instr);
+ void Compile_IF(Instruction instr);
+ void Compile_LOOP(Instruction instr);
+ void Compile_JMP(Instruction instr);
+ void Compile_CMP(Instruction instr);
+ void Compile_MAD(Instruction instr);
+
+private:
+ void Compile_Block(unsigned end);
+ void Compile_NextInstr();
+
+ void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
+ Xbyak::Xmm dest);
+ void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);
+
+ /**
+ * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
+ * zero by inf. Clobbers `src2` and `scratch`.
+ */
+ void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
+
+ void Compile_EvaluateCondition(Instruction instr);
+ void Compile_UniformCondition(Instruction instr);
+
+ /**
+ * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
+ */
+ void Compile_Return();
+
+ BitSet32 PersistentCallerSavedRegs();
+
+ /**
+ * Assertion evaluated at shader compile time, but only triggered if the offending code is executed at runtime.
+ * @param msg Message to be logged if the assertion fails.
+ */
+ void Compile_Assert(bool condition, const char* msg);
+
+ /**
+ * Analyzes the entire shader program for `CALL` instructions before emitting any code,
+ * identifying the locations where a return needs to be inserted.
+ */
+ void FindReturnOffsets();
+
+ const std::array<u32, 1024>* program_code = nullptr;
+ const std::array<u32, 1024>* swizzle_data = nullptr;
+
+ /// Mapping of Pica VS instructions to pointers in the emitted code
+ std::array<Xbyak::Label, 1024> instruction_labels;
+
+ /// Offsets in code where a return needs to be inserted
+ std::vector<unsigned> return_offsets;
+
+ unsigned program_counter = 0; ///< Offset of the next instruction to decode
+ bool looping = false; ///< True if compiling a loop, used to check for nested loops
+
+ using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
+ CompiledShader* program = nullptr;
+};
+
+} // namespace Shader
+
+} // namespace Pica
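
For orientation, the interface declared above is driven by compiling a program once and then jumping into it at an arbitrary entry point. A minimal usage sketch, assuming ShaderSetup and UnitState are the types declared in video_core/shader/shader.h (the helper name is illustrative, not from the emulator):

    #include <array>
    #include "common/common_types.h"
    #include "video_core/shader/shader.h"
    #include "video_core/shader/shader_jit_x64_compiler.h"

    // Hypothetical helper: JIT a vertex shader program and execute it from `entry_point`.
    void CompileAndRun(const std::array<u32, 1024>& code, const std::array<u32, 1024>& swizzles,
                       const Pica::Shader::ShaderSetup& setup, Pica::Shader::UnitState& state,
                       unsigned entry_point) {
        Pica::Shader::JitShader shader;        // reserves MAX_SHADER_SIZE bytes of code memory
        shader.Compile(&code, &swizzles);      // emits x86-64 and records one label per Pica PC
        shader.Run(setup, state, entry_point); // enters the generated code at that label
    }
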
diff --git a/src/video_core/texture/etc1.cpp b/src/video_core/texture/etc1.cpp
new file mode 100644
index 000000000..af60cde1e
--- /dev/null
+++ b/src/video_core/texture/etc1.cpp
@@ -0,0 +1,124 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <utility> // std::swap
+
+#include "common/bit_field.h"
+#include "common/color.h"
+#include "common/common_types.h"
+#include "common/math_util.h"
+#include "common/vector_math.h"
+#include "video_core/texture/etc1.h"
+
+namespace Pica {
+namespace Texture {
+
+namespace {
+
+constexpr std::array<u8[2], 8> etc1_modifier_table = {{
+ {2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183},
+}};
+
+union ETC1Tile {
+ u64 raw;
+
+ // Each of these two is a collection of 16 bits (one per lookup value)
+ BitField<0, 16, u64> table_subindexes;
+ BitField<16, 16, u64> negation_flags;
+
+ unsigned GetTableSubIndex(unsigned index) const {
+ return (table_subindexes >> index) & 1;
+ }
+
+ bool GetNegationFlag(unsigned index) const {
+ return ((negation_flags >> index) & 1) == 1;
+ }
+
+ BitField<32, 1, u64> flip;
+ BitField<33, 1, u64> differential_mode;
+
+ BitField<34, 3, u64> table_index_2;
+ BitField<37, 3, u64> table_index_1;
+
+ union {
+ // delta value + base value
+ BitField<40, 3, s64> db;
+ BitField<43, 5, u64> b;
+
+ BitField<48, 3, s64> dg;
+ BitField<51, 5, u64> g;
+
+ BitField<56, 3, s64> dr;
+ BitField<59, 5, u64> r;
+ } differential;
+
+ union {
+ BitField<40, 4, u64> b2;
+ BitField<44, 4, u64> b1;
+
+ BitField<48, 4, u64> g2;
+ BitField<52, 4, u64> g1;
+
+ BitField<56, 4, u64> r2;
+ BitField<60, 4, u64> r1;
+ } separate;
+
+ const Math::Vec3<u8> GetRGB(unsigned int x, unsigned int y) const {
+ int texel = 4 * x + y;
+
+ if (flip)
+ std::swap(x, y);
+
+ // Lookup base value
+ Math::Vec3<int> ret;
+ if (differential_mode) {
+ ret.r() = static_cast<int>(differential.r);
+ ret.g() = static_cast<int>(differential.g);
+ ret.b() = static_cast<int>(differential.b);
+ if (x >= 2) {
+ ret.r() += static_cast<int>(differential.dr);
+ ret.g() += static_cast<int>(differential.dg);
+ ret.b() += static_cast<int>(differential.db);
+ }
+ ret.r() = Color::Convert5To8(ret.r());
+ ret.g() = Color::Convert5To8(ret.g());
+ ret.b() = Color::Convert5To8(ret.b());
+ } else {
+ if (x < 2) {
+ ret.r() = Color::Convert4To8(static_cast<u8>(separate.r1));
+ ret.g() = Color::Convert4To8(static_cast<u8>(separate.g1));
+ ret.b() = Color::Convert4To8(static_cast<u8>(separate.b1));
+ } else {
+ ret.r() = Color::Convert4To8(static_cast<u8>(separate.r2));
+ ret.g() = Color::Convert4To8(static_cast<u8>(separate.g2));
+ ret.b() = Color::Convert4To8(static_cast<u8>(separate.b2));
+ }
+ }
+
+ // Add modifier
+ unsigned table_index =
+ static_cast<int>((x < 2) ? table_index_1.Value() : table_index_2.Value());
+
+ int modifier = etc1_modifier_table[table_index][GetTableSubIndex(texel)];
+ if (GetNegationFlag(texel))
+ modifier *= -1;
+
+ ret.r() = MathUtil::Clamp(ret.r() + modifier, 0, 255);
+ ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255);
+ ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255);
+
+ return ret.Cast<u8>();
+ }
+};
+
+} // anonymous namespace
+
+Math::Vec3<u8> SampleETC1Subtile(u64 value, unsigned int x, unsigned int y) {
+ ETC1Tile tile{value};
+ return tile.GetRGB(x, y);
+}
+
+} // namespace Texture
+} // namespace Pica
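
The modifier stage of the decoder above can be traced by hand: for table index 0 the modifier pair is {2, 8}, subindex 1 selects 8, the negation flag flips its sign, and the result is clamped to [0, 255]. A scalar sketch of just that stage, reusing the table from the code above (illustrative only):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the per-channel modifier step of ETC1Tile::GetRGB.
    static std::uint8_t ApplyEtc1Modifier(int base, unsigned table_index, unsigned subindex,
                                          bool negate) {
        static const int table[8][2] = {{2, 8},   {5, 17},  {9, 29},   {13, 42},
                                        {18, 60}, {24, 80}, {33, 106}, {47, 183}};
        int modifier = table[table_index][subindex];
        if (negate)
            modifier = -modifier;
        return static_cast<std::uint8_t>(std::min(std::max(base + modifier, 0), 255));
    }

    // Example: ApplyEtc1Modifier(250, 0, 1, false) == 255 after clamping.
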
diff --git a/src/video_core/texture/etc1.h b/src/video_core/texture/etc1.h
new file mode 100644
index 000000000..e188b19df
--- /dev/null
+++ b/src/video_core/texture/etc1.h
@@ -0,0 +1,16 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+#include "common/vector_math.h"
+
+namespace Pica {
+namespace Texture {
+
+Math::Vec3<u8> SampleETC1Subtile(u64 value, unsigned int x, unsigned int y);
+
+} // namespace Texture
+} // namespace Pica
diff --git a/src/video_core/texture/texture_decode.cpp b/src/video_core/texture/texture_decode.cpp
new file mode 100644
index 000000000..f611a1aa9
--- /dev/null
+++ b/src/video_core/texture/texture_decode.cpp
@@ -0,0 +1,229 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring> // memcpy
+
+#include "common/assert.h"
+#include "common/color.h"
+#include "common/logging/log.h"
+#include "common/math_util.h"
+#include "common/swap.h"
+#include "common/vector_math.h"
+#include "video_core/pica.h"
+#include "video_core/texture/etc1.h"
+#include "video_core/texture/texture_decode.h"
+#include "video_core/utils.h"
+
+using TextureFormat = Pica::Regs::TextureFormat;
+
+namespace Pica {
+namespace Texture {
+
+constexpr size_t TILE_SIZE = 8 * 8;
+constexpr size_t ETC1_SUBTILES = 2 * 2;
+
+size_t CalculateTileSize(TextureFormat format) {
+ switch (format) {
+ case TextureFormat::RGBA8:
+ return 4 * TILE_SIZE;
+
+ case TextureFormat::RGB8:
+ return 3 * TILE_SIZE;
+
+ case TextureFormat::RGB5A1:
+ case TextureFormat::RGB565:
+ case TextureFormat::RGBA4:
+ case TextureFormat::IA8:
+ case TextureFormat::RG8:
+ return 2 * TILE_SIZE;
+
+ case TextureFormat::I8:
+ case TextureFormat::A8:
+ case TextureFormat::IA4:
+ return 1 * TILE_SIZE;
+
+ case TextureFormat::I4:
+ case TextureFormat::A4:
+ return TILE_SIZE / 2;
+
+ case TextureFormat::ETC1:
+ return ETC1_SUBTILES * 8;
+
+ case TextureFormat::ETC1A4:
+ return ETC1_SUBTILES * 16;
+
+ default: // placeholder for yet unknown formats
+ UNIMPLEMENTED();
+ return 0;
+ }
+}
+
+Math::Vec4<u8> LookupTexture(const u8* source, unsigned int x, unsigned int y,
+ const TextureInfo& info, bool disable_alpha) {
+ // Coordinate in tiles
+ const unsigned int coarse_x = x / 8;
+ const unsigned int coarse_y = y / 8;
+
+ // Coordinate inside the tile
+ const unsigned int fine_x = x % 8;
+ const unsigned int fine_y = y % 8;
+
+ const u8* line = source + coarse_y * info.stride;
+ const u8* tile = line + coarse_x * CalculateTileSize(info.format);
+ return LookupTexelInTile(tile, fine_x, fine_y, info, disable_alpha);
+}
+
+Math::Vec4<u8> LookupTexelInTile(const u8* source, unsigned int x, unsigned int y,
+ const TextureInfo& info, bool disable_alpha) {
+ DEBUG_ASSERT(x < 8);
+ DEBUG_ASSERT(y < 8);
+
+ using VideoCore::MortonInterleave;
+
+ switch (info.format) {
+ case Regs::TextureFormat::RGBA8: {
+ auto res = Color::DecodeRGBA8(source + MortonInterleave(x, y) * 4);
+ return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
+ }
+
+ case Regs::TextureFormat::RGB8: {
+ auto res = Color::DecodeRGB8(source + MortonInterleave(x, y) * 3);
+ return {res.r(), res.g(), res.b(), 255};
+ }
+
+ case Regs::TextureFormat::RGB5A1: {
+ auto res = Color::DecodeRGB5A1(source + MortonInterleave(x, y) * 2);
+ return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
+ }
+
+ case Regs::TextureFormat::RGB565: {
+ auto res = Color::DecodeRGB565(source + MortonInterleave(x, y) * 2);
+ return {res.r(), res.g(), res.b(), 255};
+ }
+
+ case Regs::TextureFormat::RGBA4: {
+ auto res = Color::DecodeRGBA4(source + MortonInterleave(x, y) * 2);
+ return {res.r(), res.g(), res.b(), static_cast<u8>(disable_alpha ? 255 : res.a())};
+ }
+
+ case Regs::TextureFormat::IA8: {
+ const u8* source_ptr = source + MortonInterleave(x, y) * 2;
+
+ if (disable_alpha) {
+ // Show intensity as red, alpha as green
+ return {source_ptr[1], source_ptr[0], 0, 255};
+ } else {
+ return {source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0]};
+ }
+ }
+
+ case Regs::TextureFormat::RG8: {
+ auto res = Color::DecodeRG8(source + MortonInterleave(x, y) * 2);
+ return {res.r(), res.g(), 0, 255};
+ }
+
+ case Regs::TextureFormat::I8: {
+ const u8* source_ptr = source + MortonInterleave(x, y);
+ return {*source_ptr, *source_ptr, *source_ptr, 255};
+ }
+
+ case Regs::TextureFormat::A8: {
+ const u8* source_ptr = source + MortonInterleave(x, y);
+
+ if (disable_alpha) {
+ return {*source_ptr, *source_ptr, *source_ptr, 255};
+ } else {
+ return {0, 0, 0, *source_ptr};
+ }
+ }
+
+ case Regs::TextureFormat::IA4: {
+ const u8* source_ptr = source + MortonInterleave(x, y);
+
+ u8 i = Color::Convert4To8(((*source_ptr) & 0xF0) >> 4);
+ u8 a = Color::Convert4To8((*source_ptr) & 0xF);
+
+ if (disable_alpha) {
+ // Show intensity as red, alpha as green
+ return {i, a, 0, 255};
+ } else {
+ return {i, i, i, a};
+ }
+ }
+
+ case Regs::TextureFormat::I4: {
+ u32 morton_offset = MortonInterleave(x, y);
+ const u8* source_ptr = source + morton_offset / 2;
+
+ u8 i = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF);
+ i = Color::Convert4To8(i);
+
+ return {i, i, i, 255};
+ }
+
+ case Regs::TextureFormat::A4: {
+ u32 morton_offset = MortonInterleave(x, y);
+ const u8* source_ptr = source + morton_offset / 2;
+
+ u8 a = (morton_offset % 2) ? ((*source_ptr & 0xF0) >> 4) : (*source_ptr & 0xF);
+ a = Color::Convert4To8(a);
+
+ if (disable_alpha) {
+ return {a, a, a, 255};
+ } else {
+ return {0, 0, 0, a};
+ }
+ }
+
+ case Regs::TextureFormat::ETC1:
+ case Regs::TextureFormat::ETC1A4: {
+ bool has_alpha = (info.format == Regs::TextureFormat::ETC1A4);
+ size_t subtile_size = has_alpha ? 16 : 8;
+
+ // ETC1 further subdivides each 8x8 tile into four 4x4 subtiles
+ constexpr unsigned int subtile_width = 4;
+ constexpr unsigned int subtile_height = 4;
+
+ unsigned int subtile_index = (x / subtile_width) + 2 * (y / subtile_height);
+ x %= subtile_width;
+ y %= subtile_height;
+
+ const u8* subtile_ptr = source + subtile_index * subtile_size;
+
+ u8 alpha = 255;
+ if (has_alpha) {
+ u64_le packed_alpha;
+ memcpy(&packed_alpha, subtile_ptr, sizeof(u64));
+ subtile_ptr += sizeof(u64);
+
+ alpha = Color::Convert4To8((packed_alpha >> (4 * (x * subtile_width + y))) & 0xF);
+ }
+
+ u64_le subtile_data;
+ memcpy(&subtile_data, subtile_ptr, sizeof(u64));
+
+ return Math::MakeVec(SampleETC1Subtile(subtile_data, x, y),
+ disable_alpha ? (u8)255 : alpha);
+ }
+
+ default:
+ LOG_ERROR(HW_GPU, "Unknown texture format: %x", (u32)info.format);
+ DEBUG_ASSERT(false);
+ return {};
+ }
+}
+
+TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config,
+ const Regs::TextureFormat& format) {
+ TextureInfo info;
+ info.physical_address = config.GetPhysicalAddress();
+ info.width = config.width;
+ info.height = config.height;
+ info.format = format;
+ info.SetDefaultStride();
+ return info;
+}
+
+} // namespace Texture
+} // namespace Pica
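
The tile addressing in LookupTexture above reduces to a short calculation. For an RGBA8 texture of width 256 with the default stride, a tile occupies 4 * 64 = 256 bytes and a row of tiles spans 256 * (256 / 8) = 8192 bytes, so texel (19, 10) falls in tile (2, 1) at byte offset 1 * 8192 + 2 * 256 = 8704, with in-tile coordinates (3, 2). A standalone sketch of that arithmetic, using the same formulas as the code above:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t tile_size = 4 * 64;               // RGBA8: 4 bytes per texel, 8x8 texels
        const unsigned width = 256;
        const std::size_t stride = tile_size * (width / 8); // default stride: one full row of tiles

        const unsigned x = 19, y = 10;
        const unsigned coarse_x = x / 8, coarse_y = y / 8;  // tile coordinates (2, 1)
        const unsigned fine_x = x % 8, fine_y = y % 8;      // in-tile coordinates (3, 2)

        const std::size_t tile_offset = coarse_y * stride + coarse_x * tile_size;
        std::printf("tile offset = %zu, fine = (%u, %u)\n", tile_offset, fine_x, fine_y);
        // Prints: tile offset = 8704, fine = (3, 2)
        return 0;
    }
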
diff --git a/src/video_core/texture/texture_decode.h b/src/video_core/texture/texture_decode.h
new file mode 100644
index 000000000..5c636939a
--- /dev/null
+++ b/src/video_core/texture/texture_decode.h
@@ -0,0 +1,60 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+#include "common/vector_math.h"
+#include "video_core/pica.h"
+
+namespace Pica {
+namespace Texture {
+
+/// Returns the byte size of an 8x8 tile of the specified texture format.
+size_t CalculateTileSize(Pica::Regs::TextureFormat format);
+
+struct TextureInfo {
+ PAddr physical_address;
+ unsigned int width;
+ unsigned int height;
+ ptrdiff_t stride;
+ Pica::Regs::TextureFormat format;
+
+ static TextureInfo FromPicaRegister(const Pica::Regs::TextureConfig& config,
+ const Pica::Regs::TextureFormat& format);
+
+ /// Calculates stride from format and width, assuming that the entire texture is contiguous.
+ void SetDefaultStride() {
+ stride = Pica::Texture::CalculateTileSize(format) * (width / 8);
+ }
+};
+
+/**
+ * Looks up the texel located at the given coordinates and returns an RGBA vector of its color.
+ * @param source Source pointer to read data from
+ * @param x,y Texture coordinates to read from
+ * @param info TextureInfo object describing the texture setup
+ * @param disable_alpha This is used for debug widgets which use this method to display textures
+ * without providing a good way to visualize alpha by themselves. If true, this will return 255 for
+ * the alpha component, and either drop the information entirely or store it in an "unused" color
+ * channel.
+ * @todo Eventually we should get rid of the disable_alpha parameter.
+ */
+Math::Vec4<u8> LookupTexture(const u8* source, unsigned int x, unsigned int y,
+ const TextureInfo& info, bool disable_alpha = false);
+
+/**
+ * Looks up a texel from a single 8x8 texture tile.
+ *
+ * @param source Pointer to the beginning of the tile.
+ * @param x, y In-tile coordinates to read from. Must be < 8.
+ * @param info TextureInfo describing the texture format.
+ * @param disable_alpha Used for debugging. Sets the result alpha to 255 and either discards the
+ * real alpha or inserts it in an otherwise unused channel.
+ */
+Math::Vec4<u8> LookupTexelInTile(const u8* source, unsigned int x, unsigned int y,
+ const TextureInfo& info, bool disable_alpha);
+
+} // namespace Texture
+} // namespace Pica
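
A typical caller of this header fills in a TextureInfo and samples through LookupTexture. A minimal sketch, assuming `source` already points at tiled texture data in host memory (the helper and the chosen dimensions are illustrative, not from the emulator):

    #include "common/common_types.h"
    #include "common/vector_math.h"
    #include "video_core/texture/texture_decode.h"

    // Hypothetical helper: decode one texel of a tiled 128x128 RGB565 texture.
    Math::Vec4<u8> SampleTexel(const u8* source, unsigned x, unsigned y) {
        Pica::Texture::TextureInfo info;
        info.width = 128;
        info.height = 128;
        info.format = Pica::Regs::TextureFormat::RGB565;
        info.SetDefaultStride();            // stride = tile size * (width / 8)
        return Pica::Texture::LookupTexture(source, x, y, info);
    }
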
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index 2b8ef7018..bf83b61ca 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -70,7 +70,8 @@ void VertexLoader::Setup(const Pica::Regs& regs) {
is_setup = true;
}
-void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input,
+void VertexLoader::LoadVertex(u32 base_address, int index, int vertex,
+ Shader::AttributeBuffer& input,
DebugUtils::MemoryAccessTracker& memory_accesses) {
ASSERT_MSG(is_setup, "A VertexLoader needs to be setup before loading vertices.");
@@ -142,7 +143,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I
input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
} else if (vertex_attribute_is_default[i]) {
// Load the default attribute if we're configured to do so
- input.attr[i] = g_state.vs_default_attributes[i];
+ input.attr[i] = g_state.input_default_attributes.attr[i];
LOG_TRACE(HW_GPU,
"Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i,
vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h
index 9f2098bb2..51f3d45b4 100644
--- a/src/video_core/vertex_loader.h
+++ b/src/video_core/vertex_loader.h
@@ -11,7 +11,7 @@ class MemoryAccessTracker;
}
namespace Shader {
-struct InputVertex;
+struct AttributeBuffer;
}
class VertexLoader {
@@ -22,7 +22,7 @@ public:
}
void Setup(const Pica::Regs& regs);
- void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input,
+ void LoadVertex(u32 base_address, int index, int vertex, Shader::AttributeBuffer& input,
DebugUtils::MemoryAccessTracker& memory_accesses);
int GetNumTotalAttributes() const {
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 8db882f59..7186a7652 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -19,7 +19,6 @@ std::unique_ptr<RendererBase> g_renderer; ///< Renderer plugin
std::atomic<bool> g_hw_renderer_enabled;
std::atomic<bool> g_shader_jit_enabled;
-std::atomic<bool> g_scaled_resolution_enabled;
std::atomic<bool> g_vsync_enabled;
std::atomic<bool> g_toggle_framelimit_enabled;
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index c397c1974..4aba19ca0 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -37,7 +37,6 @@ extern EmuWindow* g_emu_window; ///< Emu window
// qt ui)
extern std::atomic<bool> g_hw_renderer_enabled;
extern std::atomic<bool> g_shader_jit_enabled;
-extern std::atomic<bool> g_scaled_resolution_enabled;
extern std::atomic<bool> g_toggle_framelimit_enabled;
/// Start the video core