summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/citra_qt/configure.ui12
-rw-r--r--src/common/file_util.cpp43
-rw-r--r--src/common/file_util.h26
-rw-r--r--src/common/thread.h46
-rw-r--r--src/common/x64/emitter.cpp28
-rw-r--r--src/common/x64/emitter.h2
-rw-r--r--src/core/hle/config_mem.cpp7
-rw-r--r--src/core/hle/hle.cpp2
-rw-r--r--src/core/hle/service/soc_u.cpp100
-rw-r--r--src/core/hw/y2r.cpp2
-rw-r--r--src/core/loader/3dsx.cpp6
-rw-r--r--src/core/loader/ncch.cpp4
-rw-r--r--src/video_core/command_processor.cpp4
-rw-r--r--src/video_core/debug_utils/debug_utils.cpp19
-rw-r--r--src/video_core/rasterizer.cpp99
-rw-r--r--src/video_core/shader/shader.cpp34
-rw-r--r--src/video_core/shader/shader.h3
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp302
-rw-r--r--src/video_core/shader/shader_jit_x64.h58
19 files changed, 474 insertions, 323 deletions
diff --git a/src/citra_qt/configure.ui b/src/citra_qt/configure.ui
index 3c1f2ebba..6ae056ff9 100644
--- a/src/citra_qt/configure.ui
+++ b/src/citra_qt/configure.ui
@@ -10,24 +10,12 @@
<height>501</height>
</rect>
</property>
- <property name="minimumSize">
- <size>
- <width>370</width>
- <height>219</height>
- </size>
- </property>
<property name="windowTitle">
<string>Citra Configuration</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QTabWidget" name="tabWidget">
- <property name="minimumSize">
- <size>
- <width>371</width>
- <height>221</height>
- </size>
- </property>
<property name="currentIndex">
<number>0</number>
</property>
diff --git a/src/common/file_util.cpp b/src/common/file_util.cpp
index 687b7ae5a..6e2867658 100644
--- a/src/common/file_util.cpp
+++ b/src/common/file_util.cpp
@@ -833,13 +833,12 @@ size_t WriteStringToFile(bool text_file, const std::string &str, const char *fil
size_t ReadFileToString(bool text_file, const char *filename, std::string &str)
{
- FileUtil::IOFile file(filename, text_file ? "r" : "rb");
- auto const f = file.GetHandle();
+ IOFile file(filename, text_file ? "r" : "rb"); // text_file selects text ("r") vs. binary ("rb") mode
- if (!f)
+ if (!file) // IOFile's explicit operator bool reports IsGood()
return false; // converts to 0, since the function returns size_t
- str.resize(static_cast<u32>(GetSize(f)));
+ str.resize(static_cast<u32>(file.GetSize())); // NOTE(review): GetSize() returns u64; the cast narrows it to 32 bits
return file.ReadArray(&str[0], str.size()); // reads str.size() chars straight into the string's storage
}
@@ -886,15 +885,10 @@ void SplitFilename83(const std::string& filename, std::array<char, 9>& short_nam
}
IOFile::IOFile()
- : m_file(nullptr), m_good(true)
-{}
-
-IOFile::IOFile(std::FILE* file)
- : m_file(file), m_good(true)
-{}
+{
+}
IOFile::IOFile(const std::string& filename, const char openmode[])
- : m_file(nullptr), m_good(true)
{
Open(filename, openmode);
}
@@ -905,7 +899,6 @@ IOFile::~IOFile()
}
IOFile::IOFile(IOFile&& other)
- : m_file(nullptr), m_good(true)
{
Swap(other);
}
@@ -944,26 +937,12 @@ bool IOFile::Close()
return m_good;
}
-std::FILE* IOFile::ReleaseHandle()
-{
- std::FILE* const ret = m_file;
- m_file = nullptr;
- return ret;
-}
-
-void IOFile::SetHandle(std::FILE* file)
-{
- Close();
- Clear();
- m_file = file;
-}
-
-u64 IOFile::GetSize()
+u64 IOFile::GetSize() const
{
if (IsOpen())
return FileUtil::GetSize(m_file); // delegates to the FileUtil overload that takes the FILE* handle
- else
- return 0;
+
+ return 0; // a file that is not open reports a size of 0
}
bool IOFile::Seek(s64 off, int origin)
@@ -974,12 +953,12 @@ bool IOFile::Seek(s64 off, int origin)
return m_good;
}
-u64 IOFile::Tell()
+u64 IOFile::Tell() const
{
if (IsOpen())
return ftello(m_file); // current position of the underlying stream
- else
- return -1;
+
+ return -1; // no open file: -1 converts to the largest u64 value because the return type is unsigned
}
bool IOFile::Flush()
diff --git a/src/common/file_util.h b/src/common/file_util.h
index 880b8a1e3..b54a9fb72 100644
--- a/src/common/file_util.h
+++ b/src/common/file_util.h
@@ -176,7 +176,6 @@ class IOFile : public NonCopyable
{
public:
IOFile();
- explicit IOFile(std::FILE* file);
IOFile(const std::string& filename, const char openmode[]);
~IOFile();
@@ -192,6 +191,9 @@ public:
template <typename T>
size_t ReadArray(T* data, size_t length)
{
+ static_assert(std::is_standard_layout<T>(), "Given array does not consist of standard layout objects");
+ static_assert(std::is_trivially_copyable<T>(), "Given array does not consist of trivially copyable objects");
+
if (!IsOpen()) {
m_good = false;
return -1;
@@ -207,9 +209,8 @@ public:
template <typename T>
size_t WriteArray(const T* data, size_t length)
{
- static_assert(std::is_standard_layout<T>::value, "Given array does not consist of standard layout objects");
- // TODO: gcc 4.8 does not support is_trivially_copyable, but we really should check for it here.
- //static_assert(std::is_trivially_copyable<T>::value, "Given array does not consist of trivially copyable objects");
+ static_assert(std::is_standard_layout<T>(), "Given array does not consist of standard layout objects");
+ static_assert(std::is_trivially_copyable<T>(), "Given array does not consist of trivially copyable objects");
if (!IsOpen()) {
m_good = false;
@@ -243,25 +244,20 @@ public:
// m_good is set to false when a read, write or other function fails
bool IsGood() const { return m_good; }
- operator void*() { return m_good ? m_file : nullptr; }
-
- std::FILE* ReleaseHandle();
-
- std::FILE* GetHandle() { return m_file; }
-
- void SetHandle(std::FILE* file);
+ explicit operator bool() const { return IsGood(); }
bool Seek(s64 off, int origin);
- u64 Tell();
- u64 GetSize();
+ u64 Tell() const;
+ u64 GetSize() const;
bool Resize(u64 size);
bool Flush();
// clear error state
void Clear() { m_good = true; std::clearerr(m_file); }
- std::FILE* m_file;
- bool m_good;
+private:
+ std::FILE* m_file = nullptr;
+ bool m_good = true;
};
} // namespace
diff --git a/src/common/thread.h b/src/common/thread.h
index 8255ee6d3..bbfa8befa 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -30,8 +30,7 @@
# endif
#endif
-namespace Common
-{
+namespace Common {
int CurrentThreadId();
@@ -43,55 +42,55 @@ public:
Event() : is_set(false) {}
void Set() {
- std::lock_guard<std::mutex> lk(m_mutex);
+ std::lock_guard<std::mutex> lk(mutex);
if (!is_set) {
is_set = true;
- m_condvar.notify_one();
+ condvar.notify_one();
}
}
void Wait() {
- std::unique_lock<std::mutex> lk(m_mutex);
- m_condvar.wait(lk, [&]{ return is_set; });
+ std::unique_lock<std::mutex> lk(mutex);
+ condvar.wait(lk, [&]{ return is_set; });
is_set = false;
}
void Reset() {
- std::unique_lock<std::mutex> lk(m_mutex);
+ std::unique_lock<std::mutex> lk(mutex);
// no other action required, since wait loops on the predicate and any lingering signal will get cleared on the first iteration
is_set = false;
}
private:
bool is_set;
- std::condition_variable m_condvar;
- std::mutex m_mutex;
+ std::condition_variable condvar;
+ std::mutex mutex;
};
class Barrier {
public:
- Barrier(size_t count) : m_count(count), m_waiting(0) {}
+ explicit Barrier(size_t count_) : count(count_), waiting(0), generation(0) {} // count_ = number of threads that must call Sync() per round
/// Blocks until all "count" threads have called Sync()
void Sync() {
- std::unique_lock<std::mutex> lk(m_mutex);
+ std::unique_lock<std::mutex> lk(mutex);
+ const size_t current_generation = generation; // snapshot of the round this caller belongs to, taken while holding the lock
- // TODO: broken when next round of Sync()s
- // is entered before all waiting threads return from the notify_all
-
- if (++m_waiting == m_count) {
- m_waiting = 0;
- m_condvar.notify_all();
+ if (++waiting == count) { // last thread of this round to arrive
+ generation++; // advancing the generation is what releases this round's waiters
+ waiting = 0; // reset the arrival counter for the next round
+ condvar.notify_all();
} else {
- m_condvar.wait(lk, [&]{ return m_waiting == 0; });
+ condvar.wait(lk, [this, current_generation]{ return current_generation != generation; }); // keyed on the generation rather than waiting == 0, so arrivals for the next round cannot keep this waiter blocked
}
}
private:
- std::condition_variable m_condvar;
- std::mutex m_mutex;
- const size_t m_count;
- size_t m_waiting;
+ std::condition_variable condvar;
+ std::mutex mutex; // guards waiting and generation
+ const size_t count; // threads required to complete one round
+ size_t waiting; // threads that have arrived in the current round
+ size_t generation; // Incremented once each time the barrier is used
};
void SleepCurrentThread(int ms);
@@ -100,8 +99,7 @@ void SwitchCurrentThread(); // On Linux, this is equal to sleep 1ms
// Use this function during a spin-wait to make the current thread
// relax while another thread is working. This may be more efficient
// than using events because event functions use kernel calls.
-inline void YieldCPU()
-{
+inline void YieldCPU() {
std::this_thread::yield();
}
diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp
index 1dcf2416c..5662f7f86 100644
--- a/src/common/x64/emitter.cpp
+++ b/src/common/x64/emitter.cpp
@@ -455,6 +455,18 @@ void XEmitter::CALL(const void* fnptr)
Write32(u32(distance));
}
+FixupBranch XEmitter::CALL() // Emits a CALL whose target is not known yet; resolve the returned fixup with SetJumpTarget
+{
+ FixupBranch branch;
+ branch.type = 1; // type 1: SetJumpTarget patches a 32-bit displacement
+ branch.ptr = code + 5; // 5 = bytes written below (1 opcode + 4 displacement); presumably `code` is the current write position, so ptr marks the end of the instruction — TODO confirm
+
+ Write8(0xE8); // x86 opcode for CALL rel32
+ Write32(0); // placeholder displacement, overwritten when the fixup is resolved
+
+ return branch;
+}
+
FixupBranch XEmitter::J(bool force5bytes)
{
FixupBranch branch;
@@ -531,6 +543,22 @@ void XEmitter::SetJumpTarget(const FixupBranch& branch)
}
}
+void XEmitter::SetJumpTarget(const FixupBranch& branch, const u8* target) // Resolves a pending branch/call fixup to an explicit target address
+{
+ if (branch.type == 0) // type 0: 8-bit displacement
+ {
+ s64 distance = (s64)(target - branch.ptr); // displacement is measured from branch.ptr
+ ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
+ branch.ptr[-1] = (u8)(s8)distance; // the displacement occupies the single byte just before branch.ptr
+ }
+ else if (branch.type == 1) // type 1: 32-bit displacement
+ {
+ s64 distance = (s64)(target - branch.ptr);
+ ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register");
+ ((s32*)branch.ptr)[-1] = (s32)distance; // the displacement occupies the four bytes just before branch.ptr
+ }
+} // any other branch.type is left unpatched
+
//Single byte opcodes
//There is no PUSHAD/POPAD in 64-bit mode.
void XEmitter::INT3() {Write8(0xCC);}
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index 7c6548fb5..a33724146 100644
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -425,12 +425,14 @@ public:
#undef CALL
#endif
void CALL(const void* fnptr);
+ FixupBranch CALL();
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
void J_CC(CCFlags conditionCode, const u8* addr, bool force5Bytes = false);
void SetJumpTarget(const FixupBranch& branch);
+ void SetJumpTarget(const FixupBranch& branch, const u8* target);
void SETcc(CCFlags flag, OpArg dest);
// Note: CMOV brings small if any benefit on current cpus.
diff --git a/src/core/hle/config_mem.cpp b/src/core/hle/config_mem.cpp
index b1a72dc0c..ccd73cfcb 100644
--- a/src/core/hle/config_mem.cpp
+++ b/src/core/hle/config_mem.cpp
@@ -3,13 +3,6 @@
// Refer to the license.txt file included.
#include <cstring>
-
-#include "common/assert.h"
-#include "common/common_types.h"
-#include "common/common_funcs.h"
-
-#include "core/core.h"
-#include "core/memory.h"
#include "core/hle/config_mem.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp
index 331b1b22a..e545de3b5 100644
--- a/src/core/hle/hle.cpp
+++ b/src/core/hle/hle.cpp
@@ -8,8 +8,6 @@
#include "core/arm/arm_interface.h"
#include "core/core.h"
#include "core/hle/hle.h"
-#include "core/hle/config_mem.h"
-#include "core/hle/shared_page.h"
#include "core/hle/service/service.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/hle/service/soc_u.cpp b/src/core/hle/service/soc_u.cpp
index ff0af8f12..d3e5d4bca 100644
--- a/src/core/hle/service/soc_u.cpp
+++ b/src/core/hle/service/soc_u.cpp
@@ -151,6 +151,34 @@ static int TranslateError(int error) {
return error;
}
+/// Holds the translation from 3DS network socket options (keys) to system network socket options (values)
+/// Note: -1 = No effect/unavailable
+static const std::unordered_map<int, int> sockopt_map = { {
+ { 0x0004, SO_REUSEADDR },
+ { 0x0080, -1 },
+ { 0x0100, -1 },
+ { 0x1001, SO_SNDBUF },
+ { 0x1002, SO_RCVBUF },
+ { 0x1003, -1 },
+#ifdef _WIN32
+ /// Unsupported in WinSock2
+ { 0x1004, -1 },
+#else
+ { 0x1004, SO_RCVLOWAT },
+#endif
+ { 0x1008, SO_TYPE },
+ { 0x1009, SO_ERROR },
+}};
+
+/// Converts a socket option from 3ds-specific to platform-specific; yields -1 for options sockopt_map marks as unavailable
+static int TranslateSockOpt(int console_opt_name) {
+ auto found = sockopt_map.find(console_opt_name);
+ if (found != sockopt_map.end()) {
+ return found->second; // may be -1 (no effect/unavailable)
+ }
+ return console_opt_name; // options missing from the table are passed through unchanged
+}
+
/// Holds information about a particular socket
struct SocketHolder {
u32 socket_fd; ///< The socket descriptor
@@ -568,7 +596,7 @@ static void RecvFrom(Service::Interface* self) {
socklen_t src_addr_len = sizeof(src_addr);
int ret = ::recvfrom(socket_handle, (char*)output_buff, len, flags, &src_addr, &src_addr_len);
- if (buffer_parameters.output_src_address_buffer != 0) {
+ if (ret >= 0 && buffer_parameters.output_src_address_buffer != 0 && src_addr_len > 0) {
CTRSockAddr* ctr_src_addr = reinterpret_cast<CTRSockAddr*>(Memory::GetPointer(buffer_parameters.output_src_address_buffer));
*ctr_src_addr = CTRSockAddr::FromPlatform(src_addr);
}
@@ -724,6 +752,72 @@ static void ShutdownSockets(Service::Interface* self) {
cmd_buffer[1] = 0;
}
+static void GetSockOpt(Service::Interface* self) { // Handler for command 0x11 (GetSockOpt): forwards the request to the host ::getsockopt
+ u32* cmd_buffer = Kernel::GetCommandBuffer();
+ u32 socket_handle = cmd_buffer[1];
+ u32 level = cmd_buffer[2]; // NOTE(review): level is handed to the host unchanged; only optname is translated
+ int optname = TranslateSockOpt(cmd_buffer[3]);
+ socklen_t optlen = (socklen_t)cmd_buffer[4];
+
+ int ret = -1; // stays -1 if the host call is never made
+ int err = 0;
+
+ if(optname < 0) { // -1 from sockopt_map marks an option with no host equivalent
+#ifdef _WIN32
+ err = WSAEINVAL; // NOTE(review): unlike the host-failure path below, this value is not passed through TranslateError
+#else
+ err = EINVAL;
+#endif
+ } else {
+ // 0x100 = static buffer offset (bytes)
+ // + 0x4 = 2nd pointer (u32) position
+ // >> 2 = convert to u32 offset instead of byte offset (cmd_buffer = u32*)
+ char* optval = reinterpret_cast<char *>(Memory::GetPointer(cmd_buffer[0x104 >> 2])); // NOTE(review): the returned pointer is used without a null check
+
+ ret = ::getsockopt(socket_handle, level, optname, optval, &optlen); // the host call writes the actual option length back into optlen
+ err = 0;
+ if (ret == SOCKET_ERROR_VALUE) {
+ err = TranslateError(GET_ERRNO);
+ }
+ }
+
+ cmd_buffer[0] = IPC::MakeHeader(0x11, 4, 2);
+ cmd_buffer[1] = ret;
+ cmd_buffer[2] = err;
+ cmd_buffer[3] = optlen; // on the unavailable-option path this is still the length the caller supplied
+}
+
+static void SetSockOpt(Service::Interface* self) { // Handler for command 0x12 (SetSockOpt): forwards the request to the host ::setsockopt
+ u32* cmd_buffer = Kernel::GetCommandBuffer();
+ u32 socket_handle = cmd_buffer[1];
+ u32 level = cmd_buffer[2]; // NOTE(review): level is handed to the host unchanged; only optname is translated
+ int optname = TranslateSockOpt(cmd_buffer[3]);
+
+ int ret = -1; // stays -1 if the host call is never made
+ int err = 0;
+
+ if(optname < 0) { // -1 from sockopt_map marks an option with no host equivalent
+#ifdef _WIN32
+ err = WSAEINVAL; // NOTE(review): unlike the host-failure path below, this value is not passed through TranslateError
+#else
+ err = EINVAL;
+#endif
+ } else {
+ socklen_t optlen = static_cast<socklen_t>(cmd_buffer[4]);
+ const char* optval = reinterpret_cast<const char *>(Memory::GetPointer(cmd_buffer[8])); // option value is read from the guest address in cmd_buffer[8]; NOTE(review): no null check
+
+ ret = static_cast<u32>(::setsockopt(socket_handle, level, optname, optval, optlen)); // NOTE(review): ret is an int, so the u32 cast is converted straight back and looks redundant
+ err = 0;
+ if (ret == SOCKET_ERROR_VALUE) {
+ err = TranslateError(GET_ERRNO);
+ }
+ }
+
+ cmd_buffer[0] = IPC::MakeHeader(0x12, 4, 4);
+ cmd_buffer[1] = ret;
+ cmd_buffer[2] = err;
+}
+
const Interface::FunctionInfo FunctionTable[] = {
{0x00010044, InitializeSockets, "InitializeSockets"},
{0x000200C2, Socket, "Socket"},
@@ -741,8 +835,8 @@ const Interface::FunctionInfo FunctionTable[] = {
{0x000E00C2, nullptr, "GetHostByAddr"},
{0x000F0106, nullptr, "GetAddrInfo"},
{0x00100102, nullptr, "GetNameInfo"},
- {0x00110102, nullptr, "GetSockOpt"},
- {0x00120104, nullptr, "SetSockOpt"},
+ {0x00110102, GetSockOpt, "GetSockOpt"},
+ {0x00120104, SetSockOpt, "SetSockOpt"},
{0x001300C2, Fcntl, "Fcntl"},
{0x00140084, Poll, "Poll"},
{0x00150042, nullptr, "SockAtMark"},
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
index 48c45564f..083391e83 100644
--- a/src/core/hw/y2r.cpp
+++ b/src/core/hw/y2r.cpp
@@ -261,7 +261,7 @@ void PerformConversion(ConversionConfiguration& cvt) {
ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
// Tiles per row
size_t num_tiles = cvt.input_line_width / 8;
- ASSERT(num_tiles < MAX_TILES);
+ ASSERT(num_tiles <= MAX_TILES);
// Buffer used as a CDMA source/target.
std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp
index 8eed6a50a..5fb3b9e2b 100644
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@@ -10,13 +10,9 @@
#include "core/file_sys/archive_romfs.h"
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/resource_limit.h"
-#include "core/hle/service/fs/archive.h"
-#include "core/loader/elf.h"
-#include "core/loader/ncch.h"
+#include "core/loader/3dsx.h"
#include "core/memory.h"
-#include "3dsx.h"
-
namespace Loader {
/*
diff --git a/src/core/loader/ncch.cpp b/src/core/loader/ncch.cpp
index e63cab33f..a4b47ef8c 100644
--- a/src/core/loader/ncch.cpp
+++ b/src/core/loader/ncch.cpp
@@ -174,7 +174,7 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
return ResultStatus::Error;
LOG_DEBUG(Loader, "%d sections:", kMaxSections);
- // Iterate through the ExeFs archive until we find the .code file...
+ // Iterate through the ExeFs archive until we find a section with the specified name...
for (unsigned section_number = 0; section_number < kMaxSections; section_number++) {
const auto& section = exefs_header.section[section_number];
@@ -186,7 +186,7 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
s64 section_offset = (section.offset + exefs_offset + sizeof(ExeFs_Header) + ncch_offset);
file.Seek(section_offset, SEEK_SET);
- if (is_compressed) {
+ if (strcmp(section.name, ".code") == 0 && is_compressed) {
// Section is compressed, read compressed .code section...
std::unique_ptr<u8[]> temp_buffer;
try {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 08ec2907a..3abe79c09 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -140,7 +140,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
immediate_attribute_id = 0;
Shader::UnitState<false> shader_unit;
- Shader::Setup(shader_unit);
+ Shader::Setup();
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input));
@@ -300,7 +300,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
vertex_cache_ids.fill(-1);
Shader::UnitState<false> shader_unit;
- Shader::Setup(shader_unit);
+ Shader::Setup();
for (unsigned int index = 0; index < regs.num_vertices; ++index)
{
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 693f93597..c3a9c9598 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -286,7 +286,7 @@ void StartPicaTracing()
}
std::lock_guard<std::mutex> lock(pica_trace_mutex);
- pica_trace = std::unique_ptr<PicaTrace>(new PicaTrace);
+ pica_trace = std::make_unique<PicaTrace>();
is_pica_tracing = true;
}
@@ -586,6 +586,21 @@ TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config,
return info;
}
+#ifdef HAVE_PNG
+// Adapter callbacks that let libpng write/flush through a FileUtil::IOFile; DumpTexture installs them with png_set_write_fn.
+static void WriteIOFile(png_structp png_ptr, png_bytep data, png_size_t length) {
+ auto* fp = static_cast<FileUtil::IOFile*>(png_get_io_ptr(png_ptr)); // the io pointer is the IOFile* registered in png_set_write_fn
+ if (!fp->WriteBytes(data, length))
+ png_error(png_ptr, "Failed to write to output PNG file."); // report the failure through libpng's error handling
+}
+
+static void FlushIOFile(png_structp png_ptr) {
+ auto* fp = static_cast<FileUtil::IOFile*>(png_get_io_ptr(png_ptr)); // same IOFile* as in WriteIOFile
+ if (!fp->Flush())
+ png_error(png_ptr, "Failed to flush to output PNG file.");
+}
+#endif
+
void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
#ifndef HAVE_PNG
return;
@@ -629,7 +644,7 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
goto finalise;
}
- png_init_io(png_ptr, fp.GetHandle());
+ png_set_write_fn(png_ptr, static_cast<void*>(&fp), WriteIOFile, FlushIOFile);
// Write header (8 bit color depth)
png_set_IHDR(png_ptr, info_ptr, texture_config.width, texture_config.height,
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 5b9ed7c64..0434ad05a 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -923,92 +923,72 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
if (output_merger.alphablend_enable) {
auto params = output_merger.alpha_blending;
- auto LookupFactorRGB = [&](Regs::BlendFactor factor) -> Math::Vec3<u8> {
+ auto LookupFactor = [&](unsigned channel, Regs::BlendFactor factor) -> u8 {
+ DEBUG_ASSERT(channel < 4);
+
+ const Math::Vec4<u8> blend_const = {
+ static_cast<u8>(output_merger.blend_const.r),
+ static_cast<u8>(output_merger.blend_const.g),
+ static_cast<u8>(output_merger.blend_const.b),
+ static_cast<u8>(output_merger.blend_const.a)
+ };
+
switch (factor) {
- case Regs::BlendFactor::Zero :
- return Math::Vec3<u8>(0, 0, 0);
+ case Regs::BlendFactor::Zero:
+ return 0;
- case Regs::BlendFactor::One :
- return Math::Vec3<u8>(255, 255, 255);
+ case Regs::BlendFactor::One:
+ return 255;
case Regs::BlendFactor::SourceColor:
- return combiner_output.rgb();
+ return combiner_output[channel];
case Regs::BlendFactor::OneMinusSourceColor:
- return Math::Vec3<u8>(255 - combiner_output.r(), 255 - combiner_output.g(), 255 - combiner_output.b());
+ return 255 - combiner_output[channel];
case Regs::BlendFactor::DestColor:
- return dest.rgb();
+ return dest[channel];
case Regs::BlendFactor::OneMinusDestColor:
- return Math::Vec3<u8>(255 - dest.r(), 255 - dest.g(), 255 - dest.b());
+ return 255 - dest[channel];
case Regs::BlendFactor::SourceAlpha:
- return Math::Vec3<u8>(combiner_output.a(), combiner_output.a(), combiner_output.a());
+ return combiner_output.a();
case Regs::BlendFactor::OneMinusSourceAlpha:
- return Math::Vec3<u8>(255 - combiner_output.a(), 255 - combiner_output.a(), 255 - combiner_output.a());
+ return 255 - combiner_output.a();
case Regs::BlendFactor::DestAlpha:
- return Math::Vec3<u8>(dest.a(), dest.a(), dest.a());
+ return dest.a();
case Regs::BlendFactor::OneMinusDestAlpha:
- return Math::Vec3<u8>(255 - dest.a(), 255 - dest.a(), 255 - dest.a());
+ return 255 - dest.a();
case Regs::BlendFactor::ConstantColor:
- return Math::Vec3<u8>(output_merger.blend_const.r, output_merger.blend_const.g, output_merger.blend_const.b);
+ return blend_const[channel];
case Regs::BlendFactor::OneMinusConstantColor:
- return Math::Vec3<u8>(255 - output_merger.blend_const.r, 255 - output_merger.blend_const.g, 255 - output_merger.blend_const.b);
+ return 255 - blend_const[channel];
case Regs::BlendFactor::ConstantAlpha:
- return Math::Vec3<u8>(output_merger.blend_const.a, output_merger.blend_const.a, output_merger.blend_const.a);
+ return blend_const.a();
case Regs::BlendFactor::OneMinusConstantAlpha:
- return Math::Vec3<u8>(255 - output_merger.blend_const.a, 255 - output_merger.blend_const.a, 255 - output_merger.blend_const.a);
-
- default:
- LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
- UNIMPLEMENTED();
- break;
- }
-
- return {};
- };
-
- auto LookupFactorA = [&](Regs::BlendFactor factor) -> u8 {
- switch (factor) {
- case Regs::BlendFactor::Zero:
- return 0;
-
- case Regs::BlendFactor::One:
- return 255;
-
- case Regs::BlendFactor::SourceAlpha:
- return combiner_output.a();
-
- case Regs::BlendFactor::OneMinusSourceAlpha:
- return 255 - combiner_output.a();
+ return 255 - blend_const.a();
- case Regs::BlendFactor::DestAlpha:
- return dest.a();
-
- case Regs::BlendFactor::OneMinusDestAlpha:
- return 255 - dest.a();
-
- case Regs::BlendFactor::ConstantAlpha:
- return output_merger.blend_const.a;
-
- case Regs::BlendFactor::OneMinusConstantAlpha:
- return 255 - output_merger.blend_const.a;
+ case Regs::BlendFactor::SourceAlphaSaturate:
+ // Returns 1.0 for the alpha channel
+ if (channel == 3)
+ return 255;
+ return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a()));
default:
- LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
+ LOG_CRITICAL(HW_GPU, "Unknown blend factor %x", factor);
UNIMPLEMENTED();
break;
}
- return {};
+ return combiner_output[channel];
};
static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
@@ -1060,10 +1040,15 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
MathUtil::Clamp(result.a(), 0, 255));
};
- auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
- LookupFactorA(params.factor_source_a));
- auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
- LookupFactorA(params.factor_dest_a));
+ auto srcfactor = Math::MakeVec(LookupFactor(0, params.factor_source_rgb),
+ LookupFactor(1, params.factor_source_rgb),
+ LookupFactor(2, params.factor_source_rgb),
+ LookupFactor(3, params.factor_source_a));
+
+ auto dstfactor = Math::MakeVec(LookupFactor(0, params.factor_dest_rgb),
+ LookupFactor(1, params.factor_dest_rgb),
+ LookupFactor(2, params.factor_dest_rgb),
+ LookupFactor(3, params.factor_dest_a));
blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 78d295c76..75301accd 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -28,36 +28,24 @@ namespace Pica {
namespace Shader {
#ifdef ARCHITECTURE_x86_64
-static std::unordered_map<u64, CompiledShader*> shader_map;
-static JitCompiler jit;
-static CompiledShader* jit_shader;
-
-static void ClearCache() {
- shader_map.clear();
- jit.Clear();
- LOG_INFO(HW_GPU, "Shader JIT cache cleared");
-}
+static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
+static const JitShader* jit_shader;
#endif // ARCHITECTURE_x86_64
-void Setup(UnitState<false>& state) {
+void Setup() {
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled) {
u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
- Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^
- g_state.regs.vs.main_offset);
+ Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)));
auto iter = shader_map.find(cache_key);
if (iter != shader_map.end()) {
- jit_shader = iter->second;
+ jit_shader = iter->second.get();
} else {
- // Check if remaining JIT code space is enough for at least one more (massive) shader
- if (jit.GetSpaceLeft() < jit_shader_size) {
- // If not, clear the cache of all previously compiled shaders
- ClearCache();
- }
-
- jit_shader = jit.Compile();
- shader_map.emplace(cache_key, jit_shader);
+ auto shader = std::make_unique<JitShader>();
+ shader->Compile();
+ jit_shader = shader.get();
+ shader_map[cache_key] = std::move(shader);
}
}
#endif // ARCHITECTURE_x86_64
@@ -65,7 +53,7 @@ void Setup(UnitState<false>& state) {
void Shutdown() {
#ifdef ARCHITECTURE_x86_64
- ClearCache();
+ shader_map.clear();
#endif // ARCHITECTURE_x86_64
}
@@ -109,7 +97,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled)
- jit_shader(&state.registers);
+ jit_shader->Run(&state.registers, g_state.regs.vs.main_offset);
else
RunInterpreter(state);
#else
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 7af8f1fa1..9c5bd97bd 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -339,9 +339,8 @@ struct UnitState {
/**
* Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
* vertex, which would happen within the `Run` function).
- * @param state Shader unit state, must be setup per shader and per shader unit
*/
-void Setup(UnitState<false>& state);
+void Setup();
/// Performs any cleanup when the emulator is shutdown
void Shutdown();
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index dffe051ef..b47d3beda 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <algorithm>
#include <smmintrin.h>
#include "common/x64/abi.h"
@@ -19,73 +20,73 @@ namespace Shader {
using namespace Gen;
-typedef void (JitCompiler::*JitFunction)(Instruction instr);
+typedef void (JitShader::*JitFunction)(Instruction instr);
const JitFunction instr_table[64] = {
- &JitCompiler::Compile_ADD, // add
- &JitCompiler::Compile_DP3, // dp3
- &JitCompiler::Compile_DP4, // dp4
- &JitCompiler::Compile_DPH, // dph
+ &JitShader::Compile_ADD, // add
+ &JitShader::Compile_DP3, // dp3
+ &JitShader::Compile_DP4, // dp4
+ &JitShader::Compile_DPH, // dph
nullptr, // unknown
- &JitCompiler::Compile_EX2, // ex2
- &JitCompiler::Compile_LG2, // lg2
+ &JitShader::Compile_EX2, // ex2
+ &JitShader::Compile_LG2, // lg2
nullptr, // unknown
- &JitCompiler::Compile_MUL, // mul
- &JitCompiler::Compile_SGE, // sge
- &JitCompiler::Compile_SLT, // slt
- &JitCompiler::Compile_FLR, // flr
- &JitCompiler::Compile_MAX, // max
- &JitCompiler::Compile_MIN, // min
- &JitCompiler::Compile_RCP, // rcp
- &JitCompiler::Compile_RSQ, // rsq
+ &JitShader::Compile_MUL, // mul
+ &JitShader::Compile_SGE, // sge
+ &JitShader::Compile_SLT, // slt
+ &JitShader::Compile_FLR, // flr
+ &JitShader::Compile_MAX, // max
+ &JitShader::Compile_MIN, // min
+ &JitShader::Compile_RCP, // rcp
+ &JitShader::Compile_RSQ, // rsq
nullptr, // unknown
nullptr, // unknown
- &JitCompiler::Compile_MOVA, // mova
- &JitCompiler::Compile_MOV, // mov
+ &JitShader::Compile_MOVA, // mova
+ &JitShader::Compile_MOV, // mov
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
- &JitCompiler::Compile_DPH, // dphi
+ &JitShader::Compile_DPH, // dphi
nullptr, // unknown
- &JitCompiler::Compile_SGE, // sgei
- &JitCompiler::Compile_SLT, // slti
+ &JitShader::Compile_SGE, // sgei
+ &JitShader::Compile_SLT, // slti
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
nullptr, // unknown
- &JitCompiler::Compile_NOP, // nop
- &JitCompiler::Compile_END, // end
+ &JitShader::Compile_NOP, // nop
+ &JitShader::Compile_END, // end
nullptr, // break
- &JitCompiler::Compile_CALL, // call
- &JitCompiler::Compile_CALLC, // callc
- &JitCompiler::Compile_CALLU, // callu
- &JitCompiler::Compile_IF, // ifu
- &JitCompiler::Compile_IF, // ifc
- &JitCompiler::Compile_LOOP, // loop
+ &JitShader::Compile_CALL, // call
+ &JitShader::Compile_CALLC, // callc
+ &JitShader::Compile_CALLU, // callu
+ &JitShader::Compile_IF, // ifu
+ &JitShader::Compile_IF, // ifc
+ &JitShader::Compile_LOOP, // loop
nullptr, // emit
nullptr, // sete
- &JitCompiler::Compile_JMP, // jmpc
- &JitCompiler::Compile_JMP, // jmpu
- &JitCompiler::Compile_CMP, // cmp
- &JitCompiler::Compile_CMP, // cmp
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // madi
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
- &JitCompiler::Compile_MAD, // mad
+ &JitShader::Compile_JMP, // jmpc
+ &JitShader::Compile_JMP, // jmpu
+ &JitShader::Compile_CMP, // cmp
+ &JitShader::Compile_CMP, // cmp
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // madi
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
+ &JitShader::Compile_MAD, // mad
};
// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
@@ -138,13 +139,32 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
static const u8 NO_DEST_REG_MASK = 0xf;
/**
+ * Get the vertex shader instruction for a given offset in the current shader program
+ * @param offset Offset in the current shader program of the instruction
+ * @return Instruction at the specified offset
+ */
+static Instruction GetVertexShaderInstruction(size_t offset) {
+ return { g_state.vs.program_code[offset] };
+}
+
+/**
+ * Logs a message at critical level on the HW_GPU log class. This is a plain free function
+ * so that its address can be used as a call target (see Compile_Assert).
+ * @param msg Null-terminated message to log
+ */
+static void LogCritical(const char* msg) {
+ // Pass the message as an argument instead of as the format string, so that any '%' in it
+ // is printed literally rather than being interpreted as a conversion specifier.
+ LOG_CRITICAL(HW_GPU, "%s", msg);
+}
+
+/**
+ * Emits a call to LogCritical(msg) into the generated code when `condition` is false.
+ * The condition is checked here, while the shader is being compiled; nothing is emitted
+ * when it holds.
+ * @param condition Result of the check, evaluated by the caller
+ * @param msg Message handed to LogCritical by the emitted call
+ */
+void JitShader::Compile_Assert(bool condition, const char* msg) {
+ if (!condition) {
+ // NOTE(review): the raw pointer is handed to the emitter, so msg presumably has to stay
+ // valid for as long as the compiled code can run - confirm against ABI_CallFunctionP.
+ ABI_CallFunctionP(reinterpret_cast<const void*>(LogCritical), const_cast<char*>(msg));
+ }
+}
+
+/**
* Loads and swizzles a source register into the specified XMM register.
* @param instr VS instruction, used for determining how to load the source register
* @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
* @param src_reg SourceRegister object corresponding to the source register to load
* @param dest Destination XMM register to store the loaded, swizzled source register
*/
-void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) {
+void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) {
X64Reg src_ptr;
size_t src_offset;
@@ -216,7 +236,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source
}
}
-void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
+void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
DestRegister dest;
unsigned operand_desc_id;
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
@@ -263,7 +283,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
}
-void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+void JitShader::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
MOVAPS(scratch, R(src1));
CMPPS(scratch, R(src2), CMP_ORD);
@@ -276,7 +296,7 @@ void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::
ANDPS(src1, R(scratch));
}
-void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
+void JitShader::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
case Instruction::FlowControlType::Or:
@@ -307,23 +327,23 @@ void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
}
}
-void JitCompiler::Compile_UniformCondition(Instruction instr) {
+void JitShader::Compile_UniformCondition(Instruction instr) {
int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
}
-BitSet32 JitCompiler::PersistentCallerSavedRegs() {
+BitSet32 JitShader::PersistentCallerSavedRegs() {
return persistent_regs & ABI_ALL_CALLER_SAVED;
}
-void JitCompiler::Compile_ADD(Instruction instr) {
+void JitShader::Compile_ADD(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_DP3(Instruction instr) {
+void JitShader::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -342,7 +362,7 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_DP4(Instruction instr) {
+void JitShader::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -359,7 +379,7 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_DPH(Instruction instr) {
+void JitShader::Compile_DPH(Instruction instr) {
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
@@ -391,7 +411,7 @@ void JitCompiler::Compile_DPH(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_EX2(Instruction instr) {
+void JitShader::Compile_EX2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
MOVSS(XMM0, R(SRC1));
@@ -404,7 +424,7 @@ void JitCompiler::Compile_EX2(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_LG2(Instruction instr) {
+void JitShader::Compile_LG2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
MOVSS(XMM0, R(SRC1));
@@ -417,14 +437,14 @@ void JitCompiler::Compile_LG2(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_MUL(Instruction instr) {
+void JitShader::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_SGE(Instruction instr) {
+void JitShader::Compile_SGE(Instruction instr) {
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
@@ -439,7 +459,7 @@ void JitCompiler::Compile_SGE(Instruction instr) {
Compile_DestEnable(instr, SRC2);
}
-void JitCompiler::Compile_SLT(Instruction instr) {
+void JitShader::Compile_SLT(Instruction instr) {
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
@@ -454,7 +474,7 @@ void JitCompiler::Compile_SLT(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_FLR(Instruction instr) {
+void JitShader::Compile_FLR(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
if (Common::GetCPUCaps().sse4_1) {
@@ -467,7 +487,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_MAX(Instruction instr) {
+void JitShader::Compile_MAX(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
// SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
@@ -475,7 +495,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_MIN(Instruction instr) {
+void JitShader::Compile_MIN(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
// SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
@@ -483,7 +503,7 @@ void JitCompiler::Compile_MIN(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_MOVA(Instruction instr) {
+void JitShader::Compile_MOVA(Instruction instr) {
SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] };
if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
@@ -528,12 +548,12 @@ void JitCompiler::Compile_MOVA(Instruction instr) {
}
}
-void JitCompiler::Compile_MOV(Instruction instr) {
+void JitShader::Compile_MOV(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_RCP(Instruction instr) {
+void JitShader::Compile_RCP(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
// TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
@@ -544,7 +564,7 @@ void JitCompiler::Compile_RCP(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_RSQ(Instruction instr) {
+void JitShader::Compile_RSQ(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
// TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
@@ -555,36 +575,41 @@ void JitCompiler::Compile_RSQ(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_NOP(Instruction instr) {
+void JitShader::Compile_NOP(Instruction instr) {
}
-void JitCompiler::Compile_END(Instruction instr) {
+void JitShader::Compile_END(Instruction instr) {
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
RET();
}
-void JitCompiler::Compile_CALL(Instruction instr) {
- unsigned offset = instr.flow_control.dest_offset;
- while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) {
- Compile_NextInstr(&offset);
- }
+void JitShader::Compile_CALL(Instruction instr) {
+ // Push offset of the return
+ PUSH(64, Imm32(instr.flow_control.dest_offset + instr.flow_control.num_instructions));
+
+ // Call the subroutine
+ FixupBranch b = CALL();
+ fixup_branches.push_back({ b, instr.flow_control.dest_offset });
+
+ // Skip over the return offset that's on the stack
+ ADD(64, R(RSP), Imm32(8));
}
-void JitCompiler::Compile_CALLC(Instruction instr) {
+void JitShader::Compile_CALLC(Instruction instr) {
Compile_EvaluateCondition(instr);
FixupBranch b = J_CC(CC_Z, true);
Compile_CALL(instr);
SetJumpTarget(b);
}
-void JitCompiler::Compile_CALLU(Instruction instr) {
+void JitShader::Compile_CALLU(Instruction instr) {
Compile_UniformCondition(instr);
FixupBranch b = J_CC(CC_Z, true);
Compile_CALL(instr);
SetJumpTarget(b);
}
-void JitCompiler::Compile_CMP(Instruction instr) {
+void JitShader::Compile_CMP(Instruction instr) {
using Op = Instruction::Common::CompareOpType::Op;
Op op_x = instr.common.compare_op.x;
Op op_y = instr.common.compare_op.y;
@@ -627,7 +652,7 @@ void JitCompiler::Compile_CMP(Instruction instr) {
SHR(64, R(COND1), Imm8(63));
}
-void JitCompiler::Compile_MAD(Instruction instr) {
+void JitShader::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
@@ -644,9 +669,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_DestEnable(instr, SRC1);
}
-void JitCompiler::Compile_IF(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements (%d -> %d) not supported",
- *offset_ptr, instr.flow_control.dest_offset.Value());
+void JitShader::Compile_IF(Instruction instr) {
+ Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards if-statements not supported");
// Evaluate the "IF" condition
if (instr.opcode.Value() == OpCode::Id::IFU) {
@@ -676,10 +700,9 @@ void JitCompiler::Compile_IF(Instruction instr) {
SetJumpTarget(b2);
}
-void JitCompiler::Compile_LOOP(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops (%d -> %d) not supported",
- *offset_ptr, instr.flow_control.dest_offset.Value());
- ASSERT_MSG(!looping, "Nested loops not supported");
+void JitShader::Compile_LOOP(Instruction instr) {
+ Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards loops not supported");
+ Compile_Assert(!looping, "Nested loops not supported");
looping = true;
@@ -705,10 +728,7 @@ void JitCompiler::Compile_LOOP(Instruction instr) {
looping = false;
}
-void JitCompiler::Compile_JMP(Instruction instr) {
- ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps (%d -> %d) not supported",
- *offset_ptr, instr.flow_control.dest_offset.Value());
-
+void JitShader::Compile_JMP(Instruction instr) {
if (instr.opcode.Value() == OpCode::Id::JMPC)
Compile_EvaluateCondition(instr);
else if (instr.opcode.Value() == OpCode::Id::JMPU)
@@ -718,30 +738,38 @@ void JitCompiler::Compile_JMP(Instruction instr) {
bool inverted_condition = (instr.opcode.Value() == OpCode::Id::JMPU) &&
(instr.flow_control.num_instructions & 1);
+
FixupBranch b = J_CC(inverted_condition ? CC_Z : CC_NZ, true);
+ fixup_branches.push_back({ b, instr.flow_control.dest_offset });
+}
- Compile_Block(instr.flow_control.dest_offset);
+/**
+ * Compiles instructions one after another until program_counter reaches `end`.
+ * @param end Program offset at which compilation of this block stops (exclusive)
+ */
+void JitShader::Compile_Block(unsigned end) {
+ // Compile_NextInstr advances program_counter itself
+ while (program_counter < end) {
+ Compile_NextInstr();
+ }
+}
+
+/**
+ * Emits a conditional return for the current program offset: the generated code compares
+ * the value stored at RSP + 8 with program_counter and executes RET only when they match.
+ */
+void JitShader::Compile_Return() {
+ // Peek return offset on the stack and check if we're at that offset
+ // (Compile_CALL pushes that offset right before its CALL, so it presumably sits one slot
+ // above the return address - hence the displacement of 8)
+ MOV(64, R(RAX), MDisp(RSP, 8));
+ CMP(32, R(RAX), Imm32(program_counter));
+ // If so, return to the caller, i.e. resume right after the emitted CALL;
+ // otherwise the branch below skips the RET and execution falls through
+ FixupBranch b = J_CC(CC_NZ, true);
+ RET();
SetJumpTarget(b);
}
-void JitCompiler::Compile_Block(unsigned end) {
- // Save current offset pointer
- unsigned* prev_offset_ptr = offset_ptr;
- unsigned offset = *prev_offset_ptr;
+void JitShader::Compile_NextInstr() {
+ if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
+ Compile_Return();
+ }
- while (offset < end)
- Compile_NextInstr(&offset);
+ ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!");
+ code_ptr[program_counter] = GetCodePtr();
- // Restore current offset pointer
- offset_ptr = prev_offset_ptr;
- *offset_ptr = offset;
-}
+ Instruction instr = GetVertexShaderInstruction(program_counter++);
-void JitCompiler::Compile_NextInstr(unsigned* offset) {
- offset_ptr = offset;
-
- Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++];
OpCode::Id opcode = instr.opcode.Value();
auto instr_func = instr_table[static_cast<unsigned>(opcode)];
@@ -755,9 +783,35 @@ void JitCompiler::Compile_NextInstr(unsigned* offset) {
}
}
-CompiledShader* JitCompiler::Compile() {
- const u8* start = GetCodePtr();
- unsigned offset = g_state.regs.vs.main_offset;
+/**
+ * Scans the whole vertex shader program for CALL, CALLC and CALLU instructions and records,
+ * for each of them, the offset just past the called range (dest_offset + num_instructions).
+ * The offsets are stored in return_offsets in ascending order so they can be binary-searched.
+ */
+void JitShader::FindReturnOffsets() {
+ return_offsets.clear();
+
+ for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) {
+ Instruction instr = GetVertexShaderInstruction(offset);
+
+ switch (instr.opcode.Value()) {
+ case OpCode::Id::CALL:
+ case OpCode::Id::CALLC:
+ case OpCode::Id::CALLU:
+ return_offsets.push_back(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+ break;
+ default:
+ // Only the call opcodes define a return location; everything else is ignored
+ break;
+ }
+ }
+
+ // Sort for efficient binary search later
+ std::sort(return_offsets.begin(), return_offsets.end());
+}
+
+void JitShader::Compile() {
+ // Reset flow control state
+ program = (CompiledShader*)GetCodePtr();
+ program_counter = 0;
+ looping = false;
+ code_ptr.fill(nullptr);
+ fixup_branches.clear();
+
+ // Find all `CALL` instructions and identify return locations
+ FindReturnOffsets();
// The stack pointer is 8 modulo 16 at the entry of a procedure
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
@@ -780,21 +834,31 @@ CompiledShader* JitCompiler::Compile() {
MOV(PTRBITS, R(RAX), ImmPtr(&neg));
MOVAPS(NEGBIT, MatR(RAX));
- looping = false;
+ // Jump to start of the shader program
+ JMPptr(R(ABI_PARAM2));
+
+ // Compile entire program
+ Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
- while (offset < g_state.vs.program_code.size()) {
- Compile_NextInstr(&offset);
+ // Set the target for any incomplete branches now that the entire shader program has been emitted
+ for (const auto& branch : fixup_branches) {
+ SetJumpTarget(branch.first, code_ptr[branch.second]);
}
- return (CompiledShader*)start;
-}
+ // Free memory that's no longer needed
+ return_offsets.clear();
+ return_offsets.shrink_to_fit();
+ fixup_branches.clear();
+ fixup_branches.shrink_to_fit();
+
+ uintptr_t size = reinterpret_cast<uintptr_t>(GetCodePtr()) - reinterpret_cast<uintptr_t>(program);
+ ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
-JitCompiler::JitCompiler() {
- AllocCodeSpace(jit_cache_size);
+ LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size);
}
-void JitCompiler::Clear() {
- ClearCodeSpace();
+JitShader::JitShader() {
+ AllocCodeSpace(MAX_SHADER_SIZE);
}
} // namespace Shader
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index 5357c964b..cd6280ade 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -4,6 +4,9 @@
#pragma once
+#include <utility>
+#include <vector>
+
#include <nihstro/shader_bytecode.h>
#include "common/x64/emitter.h"
@@ -19,24 +22,22 @@ namespace Pica {
namespace Shader {
-/// Memory needed to be available to compile the next shader (otherwise, clear the cache)
-constexpr size_t jit_shader_size = 1024 * 512;
-/// Memory allocated for the JIT code space cache
-constexpr size_t jit_cache_size = 1024 * 1024 * 8;
-
-using CompiledShader = void(void* registers);
+/// Memory allocated for each compiled shader (64 KiB)
+constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
/**
* This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
* code that can be executed on the host machine directly.
*/
-class JitCompiler : public Gen::XCodeBlock {
+class JitShader : public Gen::XCodeBlock {
public:
- JitCompiler();
+ JitShader();
- CompiledShader* Compile();
+ /**
+ * Invokes the compiled program, passing it the emitted-code address recorded for `offset`.
+ * @param registers Forwarded unchanged as the first argument of the compiled program
+ * @param offset Index into code_ptr selecting the entry point; not range-checked here
+ */
+ void Run(void* registers, unsigned offset) const {
+ program(registers, code_ptr[offset]);
+ }
- void Clear();
+ void Compile();
void Compile_ADD(Instruction instr);
void Compile_DP3(Instruction instr);
@@ -66,8 +67,9 @@ public:
void Compile_MAD(Instruction instr);
private:
+
void Compile_Block(unsigned end);
- void Compile_NextInstr(unsigned* offset);
+ void Compile_NextInstr();
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
@@ -81,13 +83,39 @@ private:
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);
+ /**
+ * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
+ */
+ void Compile_Return();
+
BitSet32 PersistentCallerSavedRegs();
- /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
- unsigned* offset_ptr = nullptr;
+ /**
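+ * Assertion evaluated when the shader is compiled, but only triggered if the emitted check is
+ * executed at runtime.
+ * @param condition Condition that is expected to hold; evaluated while compiling the shader.
+ * @param msg Message to be logged if the assertion fails.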
+ */
+ void Compile_Assert(bool condition, const char* msg);
+
+ /**
+ * Analyzes the entire shader program for `CALL` instructions before emitting any code,
+ * identifying the locations where a return needs to be inserted.
+ */
+ void FindReturnOffsets();
+
+ /// Mapping of Pica VS instructions to pointers in the emitted code
+ std::array<const u8*, 1024> code_ptr;
+
+ /// Offsets in code where a return needs to be inserted
+ std::vector<unsigned> return_offsets;
+
+ unsigned program_counter = 0; ///< Offset of the next instruction to decode
+ bool looping = false; ///< True if compiling a loop, used to check for nested loops
+
+ /// Branches that need to be fixed up once the entire shader program is compiled
+ std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches;
- /// Set to true if currently in a loop, used to check for the existence of nested loops
- bool looping = false;
+ using CompiledShader = void(void* registers, const u8* start_addr);
+ CompiledShader* program = nullptr;
};
} // Shader