diff options
Diffstat (limited to '')
56 files changed, 1771 insertions, 1279 deletions
diff --git a/src/audio_core/audio_core.cpp b/src/audio_core/audio_core.cpp index ba6acf28e..84f9c03a7 100644 --- a/src/audio_core/audio_core.cpp +++ b/src/audio_core/audio_core.cpp @@ -56,20 +56,8 @@ void AddAddressSpace(Kernel::VMManager& address_space) { } void SelectSink(std::string sink_id) { - auto iter = - std::find_if(g_sink_details.begin(), g_sink_details.end(), - [sink_id](const auto& sink_detail) { return sink_detail.id == sink_id; }); - - if (sink_id == "auto" || iter == g_sink_details.end()) { - if (sink_id != "auto") { - LOG_ERROR(Audio, "AudioCore::SelectSink given invalid sink_id %s", sink_id.c_str()); - } - // Auto-select. - // g_sink_details is ordered in terms of desirability, with the best choice at the front. - iter = g_sink_details.begin(); - } - - DSP::HLE::SetSink(iter->factory()); + const SinkDetails& sink_details = GetSinkDetails(sink_id); + DSP::HLE::SetSink(sink_details.factory()); } void EnableStretching(bool enable) { diff --git a/src/audio_core/null_sink.h b/src/audio_core/null_sink.h index e7668438c..c732926a2 100644 --- a/src/audio_core/null_sink.h +++ b/src/audio_core/null_sink.h @@ -23,6 +23,12 @@ public: size_t SamplesInQueue() const override { return 0; } + + void SetDevice(int device_id) override {} + + std::vector<std::string> GetDeviceList() const override { + return {}; + } }; } // namespace AudioCore diff --git a/src/audio_core/sdl2_sink.cpp b/src/audio_core/sdl2_sink.cpp index 4b66cd826..933c5f16d 100644 --- a/src/audio_core/sdl2_sink.cpp +++ b/src/audio_core/sdl2_sink.cpp @@ -4,12 +4,12 @@ #include <list> #include <numeric> -#include <vector> #include <SDL.h> #include "audio_core/audio_core.h" #include "audio_core/sdl2_sink.h" #include "common/assert.h" #include "common/logging/log.h" +#include "core/settings.h" namespace AudioCore { @@ -42,10 +42,24 @@ SDL2Sink::SDL2Sink() : impl(std::make_unique<Impl>()) { SDL_AudioSpec obtained_audiospec; SDL_zero(obtained_audiospec); - impl->audio_device_id = - 
SDL_OpenAudioDevice(nullptr, false, &desired_audiospec, &obtained_audiospec, 0); + int device_count = SDL_GetNumAudioDevices(0); + device_list.clear(); + for (int i = 0; i < device_count; ++i) { + device_list.push_back(SDL_GetAudioDeviceName(i, 0)); + } + + const char* device = nullptr; + + if (device_count >= 1 && Settings::values.audio_device_id != "auto" && + !Settings::values.audio_device_id.empty()) { + device = Settings::values.audio_device_id.c_str(); + } + + impl->audio_device_id = SDL_OpenAudioDevice(device, false, &desired_audiospec, + &obtained_audiospec, SDL_AUDIO_ALLOW_ANY_CHANGE); if (impl->audio_device_id <= 0) { - LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed with: %s", SDL_GetError()); + LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed with code %d for device \"%s\"", + impl->audio_device_id, Settings::values.audio_device_id.c_str()); return; } @@ -69,6 +83,10 @@ unsigned int SDL2Sink::GetNativeSampleRate() const { return impl->sample_rate; } +std::vector<std::string> SDL2Sink::GetDeviceList() const { + return device_list; +} + void SDL2Sink::EnqueueSamples(const s16* samples, size_t sample_count) { if (impl->audio_device_id <= 0) return; @@ -96,6 +114,10 @@ size_t SDL2Sink::SamplesInQueue() const { return total_size; } +void SDL2Sink::SetDevice(int device_id) { + this->device_id = device_id; +} + void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) { Impl* impl = reinterpret_cast<Impl*>(impl_); diff --git a/src/audio_core/sdl2_sink.h b/src/audio_core/sdl2_sink.h index ccd0f7c7e..bcc725369 100644 --- a/src/audio_core/sdl2_sink.h +++ b/src/audio_core/sdl2_sink.h @@ -21,9 +21,14 @@ public: size_t SamplesInQueue() const override; + std::vector<std::string> GetDeviceList() const override; + void SetDevice(int device_id) override; + private: struct Impl; std::unique_ptr<Impl> impl; + int device_id; + std::vector<std::string> device_list; }; } // namespace AudioCore diff --git a/src/audio_core/sink.h 
b/src/audio_core/sink.h index 08f3bab5b..558c8c0fe 100644 --- a/src/audio_core/sink.h +++ b/src/audio_core/sink.h @@ -31,6 +31,15 @@ public: /// Samples enqueued that have not been played yet. virtual std::size_t SamplesInQueue() const = 0; + + /** + * Sets the desired output device. + * @param device_id Id of the desired device. + */ + virtual void SetDevice(int device_id) = 0; + + /// Returns the list of available devices. + virtual std::vector<std::string> GetDeviceList() const = 0; }; } // namespace diff --git a/src/audio_core/sink_details.cpp b/src/audio_core/sink_details.cpp index 95ccc9e9d..6972395af 100644 --- a/src/audio_core/sink_details.cpp +++ b/src/audio_core/sink_details.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> #include <memory> #include <vector> #include "audio_core/null_sink.h" @@ -9,6 +10,7 @@ #ifdef HAVE_SDL2 #include "audio_core/sdl2_sink.h" #endif +#include "common/logging/log.h" namespace AudioCore { @@ -20,4 +22,21 @@ const std::vector<SinkDetails> g_sink_details = { {"null", []() { return std::make_unique<NullSink>(); }}, }; +const SinkDetails& GetSinkDetails(std::string sink_id) { + auto iter = + std::find_if(g_sink_details.begin(), g_sink_details.end(), + [sink_id](const auto& sink_detail) { return sink_detail.id == sink_id; }); + + if (sink_id == "auto" || iter == g_sink_details.end()) { + if (sink_id != "auto") { + LOG_ERROR(Audio, "AudioCore::SelectSink given invalid sink_id %s", sink_id.c_str()); + } + // Auto-select. + // g_sink_details is ordered in terms of desirability, with the best choice at the front. 
+ iter = g_sink_details.begin(); + } + + return *iter; +} + } // namespace AudioCore diff --git a/src/audio_core/sink_details.h b/src/audio_core/sink_details.h index 4b30cf835..9d3735171 100644 --- a/src/audio_core/sink_details.h +++ b/src/audio_core/sink_details.h @@ -24,4 +24,6 @@ struct SinkDetails { extern const std::vector<SinkDetails> g_sink_details; +const SinkDetails& GetSinkDetails(std::string sink_id); + } // namespace AudioCore diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp index 99c096ac7..76f5caeb1 100644 --- a/src/citra/citra.cpp +++ b/src/citra/citra.cpp @@ -141,6 +141,26 @@ int main(int argc, char** argv) { case Core::System::ResultStatus::ErrorLoader: LOG_CRITICAL(Frontend, "Failed to load ROM!"); return -1; + case Core::System::ResultStatus::ErrorLoader_ErrorEncrypted: + LOG_CRITICAL(Frontend, "The game that you are trying to load must be decrypted before " + "being used with Citra. \n\n For more information on dumping and " + "decrypting games, please refer to: " + "https://citra-emu.org/wiki/Dumping-Game-Cartridges"); + return -1; + case Core::System::ResultStatus::ErrorLoader_ErrorInvalidFormat: + LOG_CRITICAL(Frontend, "Error while loading ROM: The ROM format is not supported."); + return -1; + case Core::System::ResultStatus::ErrorNotInitialized: + LOG_CRITICAL(Frontend, "CPUCore not initialized"); + return -1; + case Core::System::ResultStatus::ErrorSystemMode: + LOG_CRITICAL(Frontend, "Failed to determine system mode!"); + return -1; + case Core::System::ResultStatus::ErrorVideoCore: + LOG_CRITICAL(Frontend, "VideoCore not initialized"); + return -1; + case Core::System::ResultStatus::Success: + break; // Expected case } while (emu_window->IsOpen()) { diff --git a/src/citra/config.cpp b/src/citra/config.cpp index bd8ac563b..827c90e55 100644 --- a/src/citra/config.cpp +++ b/src/citra/config.cpp @@ -63,7 +63,8 @@ void Config::ReadValues() { // Renderer Settings::values.use_hw_renderer = sdl2_config->GetBoolean("Renderer", 
"use_hw_renderer", true); Settings::values.use_shader_jit = sdl2_config->GetBoolean("Renderer", "use_shader_jit", true); - Settings::values.resolution_factor = sdl2_config->GetReal("Renderer", "resolution_factor", 1.0); + Settings::values.resolution_factor = + (float)sdl2_config->GetReal("Renderer", "resolution_factor", 1.0); Settings::values.use_vsync = sdl2_config->GetBoolean("Renderer", "use_vsync", false); Settings::values.toggle_framelimit = sdl2_config->GetBoolean("Renderer", "toggle_framelimit", true); @@ -81,6 +82,7 @@ void Config::ReadValues() { Settings::values.sink_id = sdl2_config->Get("Audio", "output_engine", "auto"); Settings::values.enable_audio_stretching = sdl2_config->GetBoolean("Audio", "enable_audio_stretching", true); + Settings::values.audio_device_id = sdl2_config->Get("Audio", "output_device", "auto"); // Data Storage Settings::values.use_virtual_sd = diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h index 7996813b4..d728fb9e8 100644 --- a/src/citra/default_ini.h +++ b/src/citra/default_ini.h @@ -91,6 +91,10 @@ output_engine = # 0: No, 1 (default): Yes enable_audio_stretching = +# Which audio device to use. +# auto (default): Auto-select +output_device = + [Data Storage] # Whether to create a virtual SD card. 
# 1 (default): Yes, 0: No diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt index 93f1c339d..d4460bf01 100644 --- a/src/citra_qt/CMakeLists.txt +++ b/src/citra_qt/CMakeLists.txt @@ -14,7 +14,6 @@ set(SRCS debugger/graphics/graphics_tracing.cpp debugger/graphics/graphics_vertex_shader.cpp debugger/profiler.cpp - debugger/ramview.cpp debugger/registers.cpp debugger/wait_tree.cpp util/spinbox.cpp @@ -48,7 +47,6 @@ set(HEADERS debugger/graphics/graphics_tracing.h debugger/graphics/graphics_vertex_shader.h debugger/profiler.h - debugger/ramview.h debugger/registers.h debugger/wait_tree.h util/spinbox.h @@ -100,7 +98,7 @@ if (APPLE) else() add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) endif() -target_link_libraries(citra-qt core video_core audio_core common qhexedit) +target_link_libraries(citra-qt core video_core audio_core common) target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) target_link_libraries(citra-qt ${PLATFORM_LIBRARIES} Threads::Threads) diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp index 8021667d0..f776e16b2 100644 --- a/src/citra_qt/config.cpp +++ b/src/citra_qt/config.cpp @@ -63,6 +63,8 @@ void Config::ReadValues() { Settings::values.sink_id = qt_config->value("output_engine", "auto").toString().toStdString(); Settings::values.enable_audio_stretching = qt_config->value("enable_audio_stretching", true).toBool(); + Settings::values.audio_device_id = + qt_config->value("output_device", "auto").toString().toStdString(); qt_config->endGroup(); qt_config->beginGroup("Data Storage"); @@ -169,6 +171,7 @@ void Config::SaveValues() { qt_config->beginGroup("Audio"); qt_config->setValue("output_engine", QString::fromStdString(Settings::values.sink_id)); qt_config->setValue("enable_audio_stretching", Settings::values.enable_audio_stretching); + qt_config->setValue("output_device", QString::fromStdString(Settings::values.audio_device_id)); qt_config->endGroup(); qt_config->beginGroup("Data Storage"); 
diff --git a/src/citra_qt/configure_audio.cpp b/src/citra_qt/configure_audio.cpp index 3cdd4c780..3ddcf9232 100644 --- a/src/citra_qt/configure_audio.cpp +++ b/src/citra_qt/configure_audio.cpp @@ -2,6 +2,9 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <memory> +#include "audio_core/audio_core.h" +#include "audio_core/sink.h" #include "audio_core/sink_details.h" #include "citra_qt/configure_audio.h" #include "core/settings.h" @@ -18,6 +21,8 @@ ConfigureAudio::ConfigureAudio(QWidget* parent) } this->setConfiguration(); + connect(ui->output_sink_combo_box, SIGNAL(currentIndexChanged(int)), this, + SLOT(updateAudioDevices(int))); } ConfigureAudio::~ConfigureAudio() {} @@ -33,6 +38,19 @@ void ConfigureAudio::setConfiguration() { ui->output_sink_combo_box->setCurrentIndex(new_sink_index); ui->toggle_audio_stretching->setChecked(Settings::values.enable_audio_stretching); + + // The device list cannot be pre-populated (nor listed) until the output sink is known. 
+ updateAudioDevices(new_sink_index); + + int new_device_index = -1; + for (int index = 0; index < ui->audio_device_combo_box->count(); index++) { + if (ui->audio_device_combo_box->itemText(index).toStdString() == + Settings::values.audio_device_id) { + new_device_index = index; + break; + } + } + ui->audio_device_combo_box->setCurrentIndex(new_device_index); } void ConfigureAudio::applyConfiguration() { @@ -40,5 +58,20 @@ void ConfigureAudio::applyConfiguration() { ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex()) .toStdString(); Settings::values.enable_audio_stretching = ui->toggle_audio_stretching->isChecked(); + Settings::values.audio_device_id = + ui->audio_device_combo_box->itemText(ui->audio_device_combo_box->currentIndex()) + .toStdString(); Settings::Apply(); } + +void ConfigureAudio::updateAudioDevices(int sink_index) { + ui->audio_device_combo_box->clear(); + ui->audio_device_combo_box->addItem("auto"); + + std::string sink_id = ui->output_sink_combo_box->itemText(sink_index).toStdString(); + std::vector<std::string> device_list = + AudioCore::GetSinkDetails(sink_id).factory()->GetDeviceList(); + for (const auto& device : device_list) { + ui->audio_device_combo_box->addItem(device.c_str()); + } +} diff --git a/src/citra_qt/configure_audio.h b/src/citra_qt/configure_audio.h index 51df2e27b..8190e694f 100644 --- a/src/citra_qt/configure_audio.h +++ b/src/citra_qt/configure_audio.h @@ -20,6 +20,9 @@ public: void applyConfiguration(); +public slots: + void updateAudioDevices(int sink_index); + private: void setConfiguration(); diff --git a/src/citra_qt/configure_audio.ui b/src/citra_qt/configure_audio.ui index 3e2b4635f..dd870eb61 100644 --- a/src/citra_qt/configure_audio.ui +++ b/src/citra_qt/configure_audio.ui @@ -35,6 +35,21 @@ </property> </widget> </item> + <item> + <layout class="QHBoxLayout"> + <item> + <widget class="QLabel"> + <property name="text"> + <string>Audio Device:</string> + </property> + </widget> + </item> + 
<item> + <widget class="QComboBox" name="audio_device_combo_box"> + </widget> + </item> + </layout> + </item> </layout> </widget> </item> diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp index ff2e7e363..f37524190 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp @@ -18,7 +18,9 @@ #include "citra_qt/util/util.h" #include "video_core/pica.h" #include "video_core/pica_state.h" +#include "video_core/shader/debug_data.h" #include "video_core/shader/shader.h" +#include "video_core/shader/shader_interpreter.h" using nihstro::OpCode; using nihstro::Instruction; @@ -518,8 +520,9 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d info.labels.insert({entry_point, "main"}); // Generate debug information - debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, - shader_setup); + Pica::Shader::InterpreterEngine shader_engine; + shader_engine.SetupBatch(shader_setup, entry_point); + debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes); // Reload widget state for (int attr = 0; attr < num_attributes; ++attr) { diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h index bedea0bed..3292573f3 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h @@ -8,6 +8,7 @@ #include <QTreeView> #include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h" #include "nihstro/parser_shbin.h" +#include "video_core/shader/debug_data.h" #include "video_core/shader/shader.h" class QLabel; diff --git a/src/citra_qt/debugger/ramview.cpp b/src/citra_qt/debugger/ramview.cpp deleted file mode 100644 index 10a09dda8..000000000 --- a/src/citra_qt/debugger/ramview.cpp +++ /dev/null @@ 
-1,12 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "citra_qt/debugger/ramview.h" - -GRamView::GRamView(QWidget* parent) : QHexEdit(parent) {} - -void GRamView::OnCPUStepped() { - // TODO: QHexEdit doesn't show vertical scroll bars for > 10MB data streams... - // setData(QByteArray((const char*)Mem_RAM,sizeof(Mem_RAM)/8)); -} diff --git a/src/citra_qt/debugger/ramview.h b/src/citra_qt/debugger/ramview.h deleted file mode 100644 index d01cea93b..000000000 --- a/src/citra_qt/debugger/ramview.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "qhexedit.h" - -class GRamView : public QHexEdit { - Q_OBJECT - -public: - explicit GRamView(QWidget* parent = nullptr); - -public slots: - void OnCPUStepped(); -}; diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index 6d59cf640..f765c0147 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp @@ -12,6 +12,7 @@ #include <QFileDialog> #include <QMessageBox> #include <QtGui> +#include <QtWidgets> #include "citra_qt/bootmanager.h" #include "citra_qt/config.h" #include "citra_qt/configure_dialog.h" @@ -24,7 +25,6 @@ #include "citra_qt/debugger/graphics/graphics_tracing.h" #include "citra_qt/debugger/graphics/graphics_vertex_shader.h" #include "citra_qt/debugger/profiler.h" -#include "citra_qt/debugger/ramview.h" #include "citra_qt/debugger/registers.h" #include "citra_qt/debugger/wait_tree.h" #include "citra_qt/game_list.h" @@ -46,7 +46,6 @@ #include "core/gdbstub/gdbstub.h" #include "core/loader/loader.h" #include "core/settings.h" -#include "qhexedit.h" #include "video_core/video_core.h" #ifdef QT_STATICPLUGIN diff --git a/src/common/hash.cpp b/src/common/hash.cpp index 2309320bb..f3d390dc5 100644 --- a/src/common/hash.cpp +++ b/src/common/hash.cpp @@ -16,7 +16,7 @@ 
namespace Common { // Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do // the conversion here -static FORCE_INLINE u64 getblock64(const u64* p, int i) { +static FORCE_INLINE u64 getblock64(const u64* p, size_t i) { return p[i]; } @@ -34,9 +34,9 @@ static FORCE_INLINE u64 fmix64(u64 k) { // This is the 128-bit variant of the MurmurHash3 hash function that is targeted for 64-bit // platforms (MurmurHash3_x64_128). It was taken from: // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp -void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { +void MurmurHash3_128(const void* key, size_t len, u32 seed, void* out) { const u8* data = (const u8*)key; - const int nblocks = len / 16; + const size_t nblocks = len / 16; u64 h1 = seed; u64 h2 = seed; @@ -48,7 +48,7 @@ void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { const u64* blocks = (const u64*)(data); - for (int i = 0; i < nblocks; i++) { + for (size_t i = 0; i < nblocks; i++) { u64 k1 = getblock64(blocks, i * 2 + 0); u64 k2 = getblock64(blocks, i * 2 + 1); diff --git a/src/common/hash.h b/src/common/hash.h index a3850be68..ee2560dad 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -4,11 +4,12 @@ #pragma once +#include <cstddef> #include "common/common_types.h" namespace Common { -void MurmurHash3_128(const void* key, int len, u32 seed, void* out); +void MurmurHash3_128(const void* key, size_t len, u32 seed, void* out); /** * Computes a 64-bit hash over the specified block of data @@ -16,7 +17,7 @@ void MurmurHash3_128(const void* key, int len, u32 seed, void* out); * @param len Length of data (in bytes) to compute hash over * @returns 64-bit hash value that was computed over the data block */ -static inline u64 ComputeHash64(const void* data, int len) { +static inline u64 ComputeHash64(const void* data, size_t len) { u64 res[2]; MurmurHash3_128(data, len, 0, res); return res[0]; diff --git a/src/core/core.h 
b/src/core/core.h index 1015e8847..17572a74f 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -115,7 +115,7 @@ private: static System s_instance; }; -static ARM_Interface& CPU() { +inline ARM_Interface& CPU() { return System::GetInstance().CPU(); } diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index a437d0823..276ecfdf6 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -13,7 +13,7 @@ #include "core/core.h" #include "core/core_timing.h" -int g_clock_rate_arm11 = 268123480; +int g_clock_rate_arm11 = BASE_CLOCK_RATE_ARM11; // is this really necessary? #define INITIAL_SLICE_LENGTH 20000 diff --git a/src/core/core_timing.h b/src/core/core_timing.h index b72a1b500..d2f85cd4d 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -21,6 +21,7 @@ // inside callback: // ScheduleEvent(periodInCycles - cycles_late, callback, "whatever") +constexpr int BASE_CLOCK_RATE_ARM11 = 268123480; extern int g_clock_rate_arm11; inline s64 msToCycles(int ms) { diff --git a/src/core/file_sys/archive_extsavedata.cpp b/src/core/file_sys/archive_extsavedata.cpp index 51ce78435..dd2fb167f 100644 --- a/src/core/file_sys/archive_extsavedata.cpp +++ b/src/core/file_sys/archive_extsavedata.cpp @@ -107,6 +107,8 @@ public: case PathParser::NotFound: LOG_ERROR(Service_FS, "%s not found", full_path.c_str()); return ERROR_FILE_NOT_FOUND; + case PathParser::FileFound: + break; // Expected 'success' case } FileUtil::IOFile file(full_path, "r+b"); diff --git a/src/core/file_sys/archive_sdmc.cpp b/src/core/file_sys/archive_sdmc.cpp index 333dfb92e..72ff05c65 100644 --- a/src/core/file_sys/archive_sdmc.cpp +++ b/src/core/file_sys/archive_sdmc.cpp @@ -72,6 +72,8 @@ ResultVal<std::unique_ptr<FileBackend>> SDMCArchive::OpenFileBase(const Path& pa FileUtil::CreateEmptyFile(full_path); } break; + case PathParser::FileFound: + break; // Expected 'success' case } FileUtil::IOFile file(full_path, mode.write_flag ? 
"r+b" : "rb"); @@ -106,6 +108,8 @@ ResultCode SDMCArchive::DeleteFile(const Path& path) const { case PathParser::DirectoryFound: LOG_ERROR(Service_FS, "%s is not a file", full_path.c_str()); return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; + case PathParser::FileFound: + break; // Expected 'success' case } if (FileUtil::Delete(full_path)) { @@ -154,6 +158,8 @@ static ResultCode DeleteDirectoryHelper(const Path& path, const std::string& mou case PathParser::FileFound: LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; + case PathParser::DirectoryFound: + break; // Expected 'success' case } if (deleter(full_path)) { @@ -197,6 +203,8 @@ ResultCode SDMCArchive::CreateFile(const FileSys::Path& path, u64 size) const { case PathParser::FileFound: LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); return ERROR_ALREADY_EXISTS; + case PathParser::NotFound: + break; // Expected 'success' case } if (size == 0) { @@ -238,6 +246,8 @@ ResultCode SDMCArchive::CreateDirectory(const Path& path) const { case PathParser::FileFound: LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); return ERROR_ALREADY_EXISTS; + case PathParser::NotFound: + break; // Expected 'success' case } if (FileUtil::CreateDir(mount_point + path.AsString())) { @@ -281,6 +291,8 @@ ResultVal<std::unique_ptr<DirectoryBackend>> SDMCArchive::OpenDirectory(const Pa case PathParser::FileInPath: LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); return ERROR_UNEXPECTED_FILE_OR_DIRECTORY_SDMC; + case PathParser::DirectoryFound: + break; // Expected 'success' case } auto directory = std::make_unique<DiskDirectory>(full_path); diff --git a/src/core/file_sys/savedata_archive.cpp b/src/core/file_sys/savedata_archive.cpp index f2e6a06bc..f540c4a93 100644 --- a/src/core/file_sys/savedata_archive.cpp +++ b/src/core/file_sys/savedata_archive.cpp @@ -57,6 +57,8 @@ ResultVal<std::unique_ptr<FileBackend>> 
SaveDataArchive::OpenFile(const Path& pa FileUtil::CreateEmptyFile(full_path); } break; + case PathParser::FileFound: + break; // Expected 'success' case } FileUtil::IOFile file(full_path, mode.write_flag ? "r+b" : "rb"); @@ -91,6 +93,8 @@ ResultCode SaveDataArchive::DeleteFile(const Path& path) const { case PathParser::NotFound: LOG_ERROR(Service_FS, "File not found %s", full_path.c_str()); return ERROR_FILE_NOT_FOUND; + case PathParser::FileFound: + break; // Expected 'success' case } if (FileUtil::Delete(full_path)) { @@ -139,6 +143,8 @@ static ResultCode DeleteDirectoryHelper(const Path& path, const std::string& mou case PathParser::FileFound: LOG_ERROR(Service_FS, "Unexpected file or directory %s", full_path.c_str()); return ERROR_UNEXPECTED_FILE_OR_DIRECTORY; + case PathParser::DirectoryFound: + break; // Expected 'success' case } if (deleter(full_path)) { @@ -182,6 +188,8 @@ ResultCode SaveDataArchive::CreateFile(const FileSys::Path& path, u64 size) cons case PathParser::FileFound: LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); return ERROR_FILE_ALREADY_EXISTS; + case PathParser::NotFound: + break; // Expected 'success' case } if (size == 0) { @@ -225,6 +233,8 @@ ResultCode SaveDataArchive::CreateDirectory(const Path& path) const { case PathParser::FileFound: LOG_ERROR(Service_FS, "%s already exists", full_path.c_str()); return ERROR_DIRECTORY_ALREADY_EXISTS; + case PathParser::NotFound: + break; // Expected 'success' case } if (FileUtil::CreateDir(mount_point + path.AsString())) { @@ -269,6 +279,8 @@ ResultVal<std::unique_ptr<DirectoryBackend>> SaveDataArchive::OpenDirectory( case PathParser::FileFound: LOG_ERROR(Service_FS, "Unexpected file in path %s", full_path.c_str()); return ERROR_UNEXPECTED_FILE_OR_DIRECTORY; + case PathParser::DirectoryFound: + break; // Expected 'success' case } auto directory = std::make_unique<DiskDirectory>(full_path); diff --git a/src/core/frontend/emu_window.cpp b/src/core/frontend/emu_window.cpp index 
1541cc39d..4f0f786ce 100644 --- a/src/core/frontend/emu_window.cpp +++ b/src/core/frontend/emu_window.cpp @@ -98,9 +98,9 @@ void EmuWindow::AccelerometerChanged(float x, float y, float z) { // TODO(wwylele): do a time stretch as it in GyroscopeChanged // The time stretch formula should be like // stretched_vector = (raw_vector - gravity) * stretch_ratio + gravity - accel_x = x * coef; - accel_y = y * coef; - accel_z = z * coef; + accel_x = static_cast<s16>(x * coef); + accel_y = static_cast<s16>(y * coef); + accel_z = static_cast<s16>(z * coef); } void EmuWindow::GyroscopeChanged(float x, float y, float z) { @@ -109,9 +109,9 @@ void EmuWindow::GyroscopeChanged(float x, float y, float z) { float stretch = FULL_FPS / Common::Profiling::GetTimingResultsAggregator()->GetAggregatedResults().fps; std::lock_guard<std::mutex> lock(gyro_mutex); - gyro_x = x * coef * stretch; - gyro_y = y * coef * stretch; - gyro_z = z * coef * stretch; + gyro_x = static_cast<s16>(x * coef * stretch); + gyro_y = static_cast<s16>(y * coef * stretch); + gyro_z = static_cast<s16>(z * coef * stretch); } void EmuWindow::UpdateCurrentFramebufferLayout(unsigned width, unsigned height) { diff --git a/src/core/hle/service/err_f.cpp b/src/core/hle/service/err_f.cpp index cd0a1a598..9da55f328 100644 --- a/src/core/hle/service/err_f.cpp +++ b/src/core/hle/service/err_f.cpp @@ -227,6 +227,8 @@ static void ThrowFatalError(Interface* self) { LOG_CRITICAL(Service_ERR, "FINST2: 0x%08X", errtype.exception_data.exception_info.fpinst2); break; + case ExceptionType::Undefined: + break; // Not logging exception_info for this case } LOG_CRITICAL(Service_ERR, "Datetime: %s", GetCurrentSystemTime().c_str()); break; diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index 947958703..a8c1331ed 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -149,7 +149,7 @@ static ResultCode WriteHWRegsWithMask(u32 base_address, u32 size_in_bytes, VAddr u32 mask = 
Memory::Read32(masks_vaddr); // Update the current value of the register only for set mask bits - reg_value = (reg_value & ~mask) | (data | mask); + reg_value = (reg_value & ~mask) | (data & mask); WriteSingleHWReg(base_address, reg_value); diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 676154bd4..f14ab3811 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -35,6 +35,15 @@ static u32 next_gyroscope_index; static int enable_accelerometer_count = 0; // positive means enabled static int enable_gyroscope_count = 0; // positive means enabled +static int pad_update_event; +static int accelerometer_update_event; +static int gyroscope_update_event; + +// Updating period for each HID device. These empirical values are measured from a 11.2 3DS. +constexpr u64 pad_update_ticks = BASE_CLOCK_RATE_ARM11 / 234; +constexpr u64 accelerometer_update_ticks = BASE_CLOCK_RATE_ARM11 / 104; +constexpr u64 gyroscope_update_ticks = BASE_CLOCK_RATE_ARM11 / 101; + static PadState GetCirclePadDirectionState(s16 circle_pad_x, s16 circle_pad_y) { // 30 degree and 60 degree are angular thresholds for directions constexpr float TAN30 = 0.577350269f; @@ -65,14 +74,9 @@ static PadState GetCirclePadDirectionState(s16 circle_pad_x, s16 circle_pad_y) { return state; } -void Update() { +static void UpdatePadCallback(u64 userdata, int cycles_late) { SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer()); - if (mem == nullptr) { - LOG_DEBUG(Service_HID, "Cannot update HID prior to mapping shared memory!"); - return; - } - PadState state = VideoCore::g_emu_window->GetPadState(); // Get current circle pad position and update circle pad direction @@ -131,59 +135,68 @@ void Update() { event_pad_or_touch_1->Signal(); event_pad_or_touch_2->Signal(); - // Update accelerometer - if (enable_accelerometer_count > 0) { - mem->accelerometer.index = next_accelerometer_index; - next_accelerometer_index = - 
(next_accelerometer_index + 1) % mem->accelerometer.entries.size(); - - AccelerometerDataEntry& accelerometer_entry = - mem->accelerometer.entries[mem->accelerometer.index]; - std::tie(accelerometer_entry.x, accelerometer_entry.y, accelerometer_entry.z) = - VideoCore::g_emu_window->GetAccelerometerState(); - - // Make up "raw" entry - // TODO(wwylele): - // From hardware testing, the raw_entry values are approximately, - // but not exactly, as twice as corresponding entries (or with a minus sign). - // It may caused by system calibration to the accelerometer. - // Figure out how it works, or, if no game reads raw_entry, - // the following three lines can be removed and leave raw_entry unimplemented. - mem->accelerometer.raw_entry.x = -2 * accelerometer_entry.x; - mem->accelerometer.raw_entry.z = 2 * accelerometer_entry.y; - mem->accelerometer.raw_entry.y = -2 * accelerometer_entry.z; - - // If we just updated index 0, provide a new timestamp - if (mem->accelerometer.index == 0) { - mem->accelerometer.index_reset_ticks_previous = mem->accelerometer.index_reset_ticks; - mem->accelerometer.index_reset_ticks = (s64)CoreTiming::GetTicks(); - } + // Reschedule recurrent event + CoreTiming::ScheduleEvent(pad_update_ticks - cycles_late, pad_update_event); +} + +static void UpdateAccelerometerCallback(u64 userdata, int cycles_late) { + SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer()); + + mem->accelerometer.index = next_accelerometer_index; + next_accelerometer_index = (next_accelerometer_index + 1) % mem->accelerometer.entries.size(); - event_accelerometer->Signal(); + AccelerometerDataEntry& accelerometer_entry = + mem->accelerometer.entries[mem->accelerometer.index]; + std::tie(accelerometer_entry.x, accelerometer_entry.y, accelerometer_entry.z) = + VideoCore::g_emu_window->GetAccelerometerState(); + + // Make up "raw" entry + // TODO(wwylele): + // From hardware testing, the raw_entry values are approximately, but not exactly, as twice as + // 
corresponding entries (or with a minus sign). It may be caused by system calibration to the + // accelerometer. Figure out how it works, or, if no game reads raw_entry, the following three + // lines can be removed and leave raw_entry unimplemented. + mem->accelerometer.raw_entry.x = -2 * accelerometer_entry.x; + mem->accelerometer.raw_entry.z = 2 * accelerometer_entry.y; + mem->accelerometer.raw_entry.y = -2 * accelerometer_entry.z; + + // If we just updated index 0, provide a new timestamp + if (mem->accelerometer.index == 0) { + mem->accelerometer.index_reset_ticks_previous = mem->accelerometer.index_reset_ticks; + mem->accelerometer.index_reset_ticks = (s64)CoreTiming::GetTicks(); } - // Update gyroscope - if (enable_gyroscope_count > 0) { - mem->gyroscope.index = next_gyroscope_index; - next_gyroscope_index = (next_gyroscope_index + 1) % mem->gyroscope.entries.size(); + event_accelerometer->Signal(); - GyroscopeDataEntry& gyroscope_entry = mem->gyroscope.entries[mem->gyroscope.index]; - std::tie(gyroscope_entry.x, gyroscope_entry.y, gyroscope_entry.z) = - VideoCore::g_emu_window->GetGyroscopeState(); + // Reschedule recurrent event + CoreTiming::ScheduleEvent(accelerometer_update_ticks - cycles_late, accelerometer_update_event); +} - // Make up "raw" entry - mem->gyroscope.raw_entry.x = gyroscope_entry.x; - mem->gyroscope.raw_entry.z = -gyroscope_entry.y; - mem->gyroscope.raw_entry.y = gyroscope_entry.z; +static void UpdateGyroscopeCallback(u64 userdata, int cycles_late) { + SharedMem* mem = reinterpret_cast<SharedMem*>(shared_mem->GetPointer()); - // If we just updated index 0, provide a new timestamp - if (mem->gyroscope.index == 0) { - mem->gyroscope.index_reset_ticks_previous = mem->gyroscope.index_reset_ticks; - mem->gyroscope.index_reset_ticks = (s64)CoreTiming::GetTicks(); - } + mem->gyroscope.index = next_gyroscope_index; + next_gyroscope_index = (next_gyroscope_index + 1) % mem->gyroscope.entries.size(); + + GyroscopeDataEntry& gyroscope_entry = 
mem->gyroscope.entries[mem->gyroscope.index]; + std::tie(gyroscope_entry.x, gyroscope_entry.y, gyroscope_entry.z) = + VideoCore::g_emu_window->GetGyroscopeState(); + + // Make up "raw" entry + mem->gyroscope.raw_entry.x = gyroscope_entry.x; + mem->gyroscope.raw_entry.z = -gyroscope_entry.y; + mem->gyroscope.raw_entry.y = gyroscope_entry.z; - event_gyroscope->Signal(); + // If we just updated index 0, provide a new timestamp + if (mem->gyroscope.index == 0) { + mem->gyroscope.index_reset_ticks_previous = mem->gyroscope.index_reset_ticks; + mem->gyroscope.index_reset_ticks = (s64)CoreTiming::GetTicks(); } + + event_gyroscope->Signal(); + + // Reschedule recurrent event + CoreTiming::ScheduleEvent(gyroscope_update_ticks - cycles_late, gyroscope_update_event); } void GetIPCHandles(Service::Interface* self) { @@ -204,7 +217,11 @@ void EnableAccelerometer(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); ++enable_accelerometer_count; - event_accelerometer->Signal(); + + // Schedules the accelerometer update event if the accelerometer was just enabled + if (enable_accelerometer_count == 1) { + CoreTiming::ScheduleEvent(accelerometer_update_ticks, accelerometer_update_event); + } cmd_buff[1] = RESULT_SUCCESS.raw; @@ -215,7 +232,11 @@ void DisableAccelerometer(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); --enable_accelerometer_count; - event_accelerometer->Signal(); + + // Unschedules the accelerometer update event if the accelerometer was just disabled + if (enable_accelerometer_count == 0) { + CoreTiming::UnscheduleEvent(accelerometer_update_event, 0); + } cmd_buff[1] = RESULT_SUCCESS.raw; @@ -226,7 +247,11 @@ void EnableGyroscopeLow(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); ++enable_gyroscope_count; - event_gyroscope->Signal(); + + // Schedules the gyroscope update event if the gyroscope was just enabled + if (enable_gyroscope_count == 1) { + CoreTiming::ScheduleEvent(gyroscope_update_ticks, 
gyroscope_update_event); + } cmd_buff[1] = RESULT_SUCCESS.raw; @@ -237,7 +262,11 @@ void DisableGyroscopeLow(Service::Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); --enable_gyroscope_count; - event_gyroscope->Signal(); + + // Unschedules the gyroscope update event if the gyroscope was just disabled + if (enable_gyroscope_count == 0) { + CoreTiming::UnscheduleEvent(gyroscope_update_event, 0); + } cmd_buff[1] = RESULT_SUCCESS.raw; @@ -291,6 +320,8 @@ void Init() { next_pad_index = 0; next_touch_index = 0; + next_accelerometer_index = 0; + next_gyroscope_index = 0; // Create event handles event_pad_or_touch_1 = Event::Create(ResetType::OneShot, "HID:EventPadOrTouch1"); @@ -298,6 +329,15 @@ void Init() { event_accelerometer = Event::Create(ResetType::OneShot, "HID:EventAccelerometer"); event_gyroscope = Event::Create(ResetType::OneShot, "HID:EventGyroscope"); event_debug_pad = Event::Create(ResetType::OneShot, "HID:EventDebugPad"); + + // Register update callbacks + pad_update_event = CoreTiming::RegisterEvent("HID::UpdatePadCallback", UpdatePadCallback); + accelerometer_update_event = + CoreTiming::RegisterEvent("HID::UpdateAccelerometerCallback", UpdateAccelerometerCallback); + gyroscope_update_event = + CoreTiming::RegisterEvent("HID::UpdateGyroscopeCallback", UpdateGyroscopeCallback); + + CoreTiming::ScheduleEvent(pad_update_ticks, pad_update_event); } void Shutdown() { diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h index 7904e7355..21e66dfe0 100644 --- a/src/core/hle/service/hid/hid.h +++ b/src/core/hle/service/hid/hid.h @@ -296,9 +296,6 @@ void GetGyroscopeLowRawToDpsCoefficient(Service::Interface* self); */ void GetGyroscopeLowCalibrateParam(Service::Interface* self); -/// Checks for user input updates -void Update(); - /// Initialize HID service void Init(); diff --git a/src/core/hle/service/mic_u.cpp b/src/core/hle/service/mic_u.cpp index c62f8afc6..e98388560 100644 --- a/src/core/hle/service/mic_u.cpp +++ 
b/src/core/hle/service/mic_u.cpp @@ -93,7 +93,7 @@ static void StartSampling(Interface* self) { sample_rate = static_cast<SampleRate>(cmd_buff[2] & 0xFF); audio_buffer_offset = cmd_buff[3]; audio_buffer_size = cmd_buff[4]; - audio_buffer_loop = static_cast<bool>(cmd_buff[5] & 0xFF); + audio_buffer_loop = (cmd_buff[5] & 0xFF) != 0; cmd_buff[1] = RESULT_SUCCESS.raw; // No error is_sampling = true; @@ -202,7 +202,7 @@ static void GetGain(Interface* self) { */ static void SetPower(Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - mic_power = static_cast<bool>(cmd_buff[1] & 0xFF); + mic_power = (cmd_buff[1] & 0xFF) != 0; cmd_buff[1] = RESULT_SUCCESS.raw; // No error LOG_WARNING(Service_MIC, "(STUBBED) called, mic_power=%u", mic_power); } @@ -252,7 +252,7 @@ static void SetIirFilterMic(Interface* self) { */ static void SetClamp(Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - clamp = static_cast<bool>(cmd_buff[1] & 0xFF); + clamp = (cmd_buff[1] & 0xFF) != 0; cmd_buff[1] = RESULT_SUCCESS.raw; // No error LOG_WARNING(Service_MIC, "(STUBBED) called, clamp=%u", clamp); } @@ -282,7 +282,7 @@ static void GetClamp(Interface* self) { */ static void SetAllowShellClosed(Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); - allow_shell_closed = static_cast<bool>(cmd_buff[1] & 0xFF); + allow_shell_closed = (cmd_buff[1] & 0xFF) != 0; cmd_buff[1] = RESULT_SUCCESS.raw; // No error LOG_WARNING(Service_MIC, "(STUBBED) called, allow_shell_closed=%u", allow_shell_closed); } diff --git a/src/core/hle/service/nfc/nfc.cpp b/src/core/hle/service/nfc/nfc.cpp index e248285f9..fd3c7d9c2 100644 --- a/src/core/hle/service/nfc/nfc.cpp +++ b/src/core/hle/service/nfc/nfc.cpp @@ -11,6 +11,81 @@ namespace Service { namespace NFC { static Kernel::SharedPtr<Kernel::Event> tag_in_range_event; +static Kernel::SharedPtr<Kernel::Event> tag_out_of_range_event; +static TagState nfc_tag_state = TagState::NotInitialized; +static CommunicationStatus nfc_status = 
CommunicationStatus::NfcInitialized; + +void Initialize(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + u8 param = static_cast<u8>(cmd_buff[1] & 0xFF); + + nfc_tag_state = TagState::NotScanning; + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called, param=%u", param); +} + +void Shutdown(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + u8 param = static_cast<u8>(cmd_buff[1] & 0xFF); + nfc_tag_state = TagState::NotInitialized; + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called, param=%u", param); +} + +void StartCommunication(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void StopCommunication(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void StartTagScanning(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + nfc_tag_state = TagState::TagInRange; + tag_in_range_event->Signal(); + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void StopTagScanning(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + nfc_tag_state = TagState::NotScanning; + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void LoadAmiiboData(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + nfc_tag_state = TagState::TagDataLoaded; + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void ResetTagScanState(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + nfc_tag_state = TagState::NotScanning; + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} void 
GetTagInRangeEvent(Interface* self) { u32* cmd_buff = Kernel::GetCommandBuffer(); @@ -22,16 +97,46 @@ void GetTagInRangeEvent(Interface* self) { LOG_WARNING(Service_NFC, "(STUBBED) called"); } +void GetTagOutOfRangeEvent(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[0] = IPC::MakeHeader(0xC, 1, 2); + cmd_buff[1] = RESULT_SUCCESS.raw; + cmd_buff[2] = IPC::CopyHandleDesc(); + cmd_buff[3] = Kernel::g_handle_table.Create(tag_out_of_range_event).MoveFrom(); + LOG_WARNING(Service_NFC, "(STUBBED) called"); +} + +void GetTagState(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + cmd_buff[2] = static_cast<u8>(nfc_tag_state); + LOG_DEBUG(Service_NFC, "(STUBBED) called"); +} + +void CommunicationGetStatus(Interface* self) { + u32* cmd_buff = Kernel::GetCommandBuffer(); + + cmd_buff[1] = RESULT_SUCCESS.raw; // No error + cmd_buff[2] = static_cast<u8>(nfc_status); + LOG_DEBUG(Service_NFC, "(STUBBED) called"); +} + void Init() { AddService(new NFC_M()); AddService(new NFC_U()); tag_in_range_event = Kernel::Event::Create(Kernel::ResetType::OneShot, "NFC::tag_in_range_event"); + tag_out_of_range_event = + Kernel::Event::Create(Kernel::ResetType::OneShot, "NFC::tag_out_range_event"); + nfc_tag_state = TagState::NotInitialized; } void Shutdown() { tag_in_range_event = nullptr; + tag_out_of_range_event = nullptr; } } // namespace NFC diff --git a/src/core/hle/service/nfc/nfc.h b/src/core/hle/service/nfc/nfc.h index b02354201..a013bdae7 100644 --- a/src/core/hle/service/nfc/nfc.h +++ b/src/core/hle/service/nfc/nfc.h @@ -4,12 +4,103 @@ #pragma once +#include "common/common_types.h" + namespace Service { class Interface; namespace NFC { +enum class TagState : u8 { + NotInitialized = 0, + NotScanning = 1, + Scanning = 2, + TagInRange = 3, + TagOutOfRange = 4, + TagDataLoaded = 5, +}; + +enum class CommunicationStatus : u8 { + AttemptInitialize = 1, + NfcInitialized = 2, +}; + +/** + * 
NFC::Initialize service function + * Inputs: + * 0 : Header code [0x00010040] + * 1 : (u8) unknown parameter. Can be either value 0x1 or 0x2 + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void Initialize(Interface* self); + +/** + * NFC::Shutdown service function + * Inputs: + * 0 : Header code [0x00020040] + * 1 : (u8) unknown parameter + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void Shutdown(Interface* self); + +/** + * NFC::StartCommunication service function + * Inputs: + * 0 : Header code [0x00030000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void StartCommunication(Interface* self); + +/** + * NFC::StopCommunication service function + * Inputs: + * 0 : Header code [0x00040000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void StopCommunication(Interface* self); + +/** + * NFC::StartTagScanning service function + * Inputs: + * 0 : Header code [0x00050040] + * 1 : (u16) unknown. 
This is normally 0x0 + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void StartTagScanning(Interface* self); + +/** + * NFC::StopTagScanning service function + * Inputs: + * 0 : Header code [0x00060000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void StopTagScanning(Interface* self); + +/** + * NFC::LoadAmiiboData service function + * Inputs: + * 0 : Header code [0x00070000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void LoadAmiiboData(Interface* self); + +/** + * NFC::ResetTagScanState service function + * Inputs: + * 0 : Header code [0x00080000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + */ +void ResetTagScanState(Interface* self); + /** * NFC::GetTagInRangeEvent service function * Inputs: @@ -21,6 +112,37 @@ namespace NFC { */ void GetTagInRangeEvent(Interface* self); +/** + * NFC::GetTagOutOfRangeEvent service function + * Inputs: + * 0 : Header code [0x000C0000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + * 2 : Copy handle descriptor + * 3 : Event Handle + */ +void GetTagOutOfRangeEvent(Interface* self); + +/** + * NFC::GetTagState service function + * Inputs: + * 0 : Header code [0x000D0000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + * 2 : (u8) Tag state + */ +void GetTagState(Interface* self); + +/** + * NFC::CommunicationGetStatus service function + * Inputs: + * 0 : Header code [0x000F0000] + * Outputs: + * 1 : Result of function, 0 on success, otherwise error code + * 2 : (u8) Communication state + */ +void CommunicationGetStatus(Interface* self); + /// Initialize all NFC services. 
void Init(); diff --git a/src/core/hle/service/nfc/nfc_m.cpp b/src/core/hle/service/nfc/nfc_m.cpp index f43b4029a..ebe637650 100644 --- a/src/core/hle/service/nfc/nfc_m.cpp +++ b/src/core/hle/service/nfc/nfc_m.cpp @@ -11,18 +11,19 @@ namespace NFC { const Interface::FunctionInfo FunctionTable[] = { // clang-format off // nfc:u shared commands - {0x00010040, nullptr, "Initialize"}, - {0x00020040, nullptr, "Shutdown"}, - {0x00030000, nullptr, "StartCommunication"}, - {0x00040000, nullptr, "StopCommunication"}, - {0x00050040, nullptr, "StartTagScanning"}, - {0x00060000, nullptr, "StopTagScanning"}, - {0x00070000, nullptr, "LoadAmiiboData"}, - {0x00080000, nullptr, "ResetTagScanState"}, + {0x00010040, Initialize, "Initialize"}, + {0x00020040, Shutdown, "Shutdown"}, + {0x00030000, StartCommunication, "StartCommunication"}, + {0x00040000, StopCommunication, "StopCommunication"}, + {0x00050040, StartTagScanning, "StartTagScanning"}, + {0x00060000, StopTagScanning, "StopTagScanning"}, + {0x00070000, LoadAmiiboData, "LoadAmiiboData"}, + {0x00080000, ResetTagScanState, "ResetTagScanState"}, {0x00090002, nullptr, "UpdateStoredAmiiboData"}, {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"}, - {0x000D0000, nullptr, "GetTagState"}, - {0x000F0000, nullptr, "CommunicationGetStatus"}, + {0x000C0000, GetTagOutOfRangeEvent, "GetTagOutOfRangeEvent"}, + {0x000D0000, GetTagState, "GetTagState"}, + {0x000F0000, CommunicationGetStatus, "CommunicationGetStatus"}, {0x00100000, nullptr, "GetTagInfo2"}, {0x00110000, nullptr, "GetTagInfo"}, {0x00120000, nullptr, "CommunicationGetResult"}, diff --git a/src/core/hle/service/nfc/nfc_u.cpp b/src/core/hle/service/nfc/nfc_u.cpp index 4b5200ae8..5a40c7874 100644 --- a/src/core/hle/service/nfc/nfc_u.cpp +++ b/src/core/hle/service/nfc/nfc_u.cpp @@ -10,18 +10,19 @@ namespace NFC { const Interface::FunctionInfo FunctionTable[] = { // clang-format off - {0x00010040, nullptr, "Initialize"}, - {0x00020040, nullptr, "Shutdown"}, - {0x00030000, nullptr, 
"StartCommunication"}, - {0x00040000, nullptr, "StopCommunication"}, - {0x00050040, nullptr, "StartTagScanning"}, - {0x00060000, nullptr, "StopTagScanning"}, - {0x00070000, nullptr, "LoadAmiiboData"}, - {0x00080000, nullptr, "ResetTagScanState"}, + {0x00010040, Initialize, "Initialize"}, + {0x00020040, Shutdown, "Shutdown"}, + {0x00030000, StartCommunication, "StartCommunication"}, + {0x00040000, StopCommunication, "StopCommunication"}, + {0x00050040, StartTagScanning, "StartTagScanning"}, + {0x00060000, StopTagScanning, "StopTagScanning"}, + {0x00070000, LoadAmiiboData, "LoadAmiiboData"}, + {0x00080000, ResetTagScanState, "ResetTagScanState"}, {0x00090002, nullptr, "UpdateStoredAmiiboData"}, {0x000B0000, GetTagInRangeEvent, "GetTagInRangeEvent"}, - {0x000D0000, nullptr, "GetTagState"}, - {0x000F0000, nullptr, "CommunicationGetStatus"}, + {0x000C0000, GetTagOutOfRangeEvent, "GetTagOutOfRangeEvent"}, + {0x000D0000, GetTagState, "GetTagState"}, + {0x000F0000, CommunicationGetStatus, "CommunicationGetStatus"}, {0x00100000, nullptr, "GetTagInfo2"}, {0x00110000, nullptr, "GetTagInfo"}, {0x00120000, nullptr, "CommunicationGetResult"}, diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index 1a1ee90b2..fa8c13d36 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -15,7 +15,6 @@ #include "common/vector_math.h" #include "core/core_timing.h" #include "core/hle/service/gsp_gpu.h" -#include "core/hle/service/hid/hid.h" #include "core/hw/gpu.h" #include "core/hw/hw.h" #include "core/memory.h" @@ -33,7 +32,7 @@ namespace GPU { Regs g_regs; /// 268MHz CPU clocks / 60Hz frames per second -const u64 frame_ticks = 268123480ull / 60; +const u64 frame_ticks = BASE_CLOCK_RATE_ARM11 / 60; /// Event id for CoreTiming static int vblank_event; /// Total number of frames drawn @@ -551,9 +550,6 @@ static void VBlankCallback(u64 userdata, int cycles_late) { Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC0); 
Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC1); - // Check for user input updates - Service::HID::Update(); - if (!Settings::values.use_vsync && Settings::values.toggle_framelimit) { FrameLimiter(); } diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp index 1c10740a0..09266e8b0 100644 --- a/src/core/loader/3dsx.cpp +++ b/src/core/loader/3dsx.cpp @@ -177,18 +177,34 @@ static THREEDSX_Error Load3DSXFile(FileUtil::IOFile& file, u32 base_addr, pos += table.skip; s32 num_patches = table.patch; while (0 < num_patches && pos < end_pos) { - u32 in_addr = - static_cast<u32>(reinterpret_cast<u8*>(pos) - program_image.data()); - u32 addr = TranslateAddr(*pos, &loadinfo, offsets); - LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)", - base_addr + in_addr, addr, current_segment_reloc_table, *pos); + u32 in_addr = base_addr + static_cast<u32>(reinterpret_cast<u8*>(pos) - + program_image.data()); + u32 orig_data = *pos; + u32 sub_type = orig_data >> (32 - 4); + u32 addr = TranslateAddr(orig_data & ~0xF0000000, &loadinfo, offsets); + LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)", in_addr, addr, + current_segment_reloc_table, *pos); switch (current_segment_reloc_table) { - case 0: - *pos = (addr); + case 0: { + if (sub_type != 0) + return ERROR_READ; + *pos = addr; break; - case 1: - *pos = static_cast<u32>(addr - in_addr); + } + case 1: { + u32 data = addr - in_addr; + switch (sub_type) { + case 0: // 32-bit signed offset + *pos = data; + break; + case 1: // 31-bit signed offset + *pos = data & ~(1U << 31); + break; + default: + return ERROR_READ; + } break; + } default: break; // this should never happen } diff --git a/src/core/settings.h b/src/core/settings.h index 8dbda653a..e22ce0f16 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -104,6 +104,7 @@ struct Values { // Audio std::string sink_id; bool enable_audio_stretching; + std::string audio_device_id; // Debugging bool use_gdbstub; diff --git 
a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6ca319b59..d55b84ce0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -50,10 +50,12 @@ set(HEADERS if(ARCHITECTURE_x86_64) set(SRCS ${SRCS} - shader/shader_jit_x64.cpp) + shader/shader_jit_x64.cpp + shader/shader_jit_x64_compiler.cpp) set(HEADERS ${HEADERS} - shader/shader_jit_x64.h) + shader/shader_jit_x64.h + shader/shader_jit_x64_compiler.h) endif() create_directory_groups(${SRCS} ${HEADERS}) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index ea58e9f54..eb79974a8 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -142,16 +142,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { MICROPROFILE_SCOPE(GPU_Drawing); immediate_attribute_id = 0; - Shader::UnitState shader_unit; - g_state.vs.Setup(); + auto* shader_engine = Shader::GetEngine(); + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); // Send to vertex shader if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input)); - g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1); - Shader::OutputVertex output_vertex = - shader_unit.output_registers.ToVertex(regs.vs); + Shader::UnitState shader_unit; + shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); + shader_engine->Run(g_state.vs, shader_unit); + auto output_vertex = Shader::OutputVertex::FromRegisters( + shader_unit.registers.output, regs, regs.vs.output_mask); // Send to renderer using Pica::Shader::OutputVertex; @@ -243,8 +245,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); + auto* shader_engine = Shader::GetEngine(); Shader::UnitState shader_unit; - g_state.vs.Setup(); + + shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); for (unsigned int index 
= 0; index < regs.num_vertices; ++index) { // Indexed rendering doesn't use the start offset @@ -283,10 +287,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); - g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); + shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); + shader_engine->Run(g_state.vs, shader_unit); // Retrieve vertex from register data - output_vertex = shader_unit.output_registers.ToVertex(regs.vs); + output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, + regs, regs.vs.output_mask); if (is_indexed) { vertex_cache[vertex_cache_pos] = output_vertex; diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index ce2bd455e..b4a77c632 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -499,7 +499,7 @@ void Init() { } void Shutdown() { - Shader::ClearCache(); + Shader::Shutdown(); } template <typename T> diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 5a306a5c8..f3674e965 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -716,8 +716,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) { bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) { MICROPROFILE_SCOPE(OpenGL_Blits); - using PixelFormat = CachedSurface::PixelFormat; - using SurfaceType = CachedSurface::SurfaceType; CachedSurface src_params; src_params.addr = config.GetPhysicalInputAddress(); @@ -748,7 +746,8 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe // Adjust the source rectangle to take into account parts of the input lines being cropped if (config.input_width > config.output_width) { - src_rect.right -= (config.input_width - config.output_width) * 
src_surface->res_scale_width; + src_rect.right -= static_cast<int>((config.input_width - config.output_width) * + src_surface->res_scale_width); } // Require destination surface to have same resolution scale as source to preserve scaling diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index e1a9cb361..cc3e4bed5 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -76,7 +76,7 @@ union PicaShaderConfig { } state.fog_mode = regs.fog_mode; - state.fog_flip = regs.fog_flip; + state.fog_flip = regs.fog_flip != 0; state.combiner_buffer_input = regs.tev_combiner_buffer_input.update_mask_rgb.Value() | regs.tev_combiner_buffer_input.update_mask_a.Value() << 4; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index ef3b06a7b..1e7eedecb 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -172,7 +172,6 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface, const MathUtil::Rectangle<int>& src_rect, CachedSurface* dst_surface, const MathUtil::Rectangle<int>& dst_rect) { - using SurfaceType = CachedSurface::SurfaceType; if (!CachedSurface::CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format)) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index b50e8292b..f57fdb3cc 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -8,7 +8,14 @@ #include <memory> #include <set> #include <tuple> +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedef" +#endif #include <boost/icl/interval_map.hpp> +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif #include <glad/glad.h> #include 
"common/assert.h" #include "common/common_funcs.h" diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index a4aa3c9e0..2da50bd62 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -2,14 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <atomic> #include <cmath> #include <cstring> -#include <unordered_map> -#include <utility> -#include <boost/range/algorithm/fill.hpp> -#include "common/bit_field.h" -#include "common/hash.h" #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/pica.h" @@ -25,7 +19,8 @@ namespace Pica { namespace Shader { -OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { +OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs, + u32 output_mask) { // Setup output data OutputVertex ret; // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. 
We need to @@ -33,13 +28,13 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { unsigned index = 0; for (unsigned i = 0; i < 7; ++i) { - if (index >= g_state.regs.vs_output_total) + if (index >= regs.vs_output_total) break; - if ((config.output_mask & (1 << i)) == 0) + if ((output_mask & (1 << i)) == 0) continue; - const auto& output_register_map = g_state.regs.vs_output_attributes[index]; + const auto& output_register_map = regs.vs_output_attributes[index]; u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, output_register_map.map_w}; @@ -47,7 +42,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { for (unsigned comp = 0; comp < 4; ++comp) { float24* out = ((float24*)&ret) + semantics[comp]; if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = value[i][comp]; + *out = output_regs[i][comp]; } else { // Zero output so that attributes which aren't output won't have denormals in them, // which would slow us down later. 
@@ -76,86 +71,41 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { return ret; } -#ifdef ARCHITECTURE_x86_64 -static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; -static const JitShader* jit_shader; -#endif // ARCHITECTURE_x86_64 +void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { + // Setup input register table + const auto& attribute_register_map = g_state.regs.vs.input_register_map; + + for (int i = 0; i < num_attributes; i++) + registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; +} + +MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ClearCache() { #ifdef ARCHITECTURE_x86_64 - shader_map.clear(); +static std::unique_ptr<JitX64Engine> jit_engine; #endif // ARCHITECTURE_x86_64 -} +static InterpreterEngine interpreter_engine; -void ShaderSetup::Setup() { +ShaderEngine* GetEngine() { #ifdef ARCHITECTURE_x86_64 + // TODO(yuriks): Re-initialize on each change rather than being persistent if (VideoCore::g_shader_jit_enabled) { - u64 cache_key = - Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ - Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)); - - auto iter = shader_map.find(cache_key); - if (iter != shader_map.end()) { - jit_shader = iter->second.get(); - } else { - auto shader = std::make_unique<JitShader>(); - shader->Compile(); - jit_shader = shader.get(); - shader_map[cache_key] = std::move(shader); + if (jit_engine == nullptr) { + jit_engine = std::make_unique<JitX64Engine>(); } + return jit_engine.get(); } #endif // ARCHITECTURE_x86_64 -} - -MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); - -void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) { - auto& config = g_state.regs.vs; - auto& setup = g_state.vs; - - MICROPROFILE_SCOPE(GPU_Shader); - // Setup input register table - const auto& 
attribute_register_map = config.input_register_map; - - for (unsigned i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; + return &interpreter_engine; +} +void Shutdown() { #ifdef ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) { - jit_shader->Run(setup, state, config.main_offset); - } else { - DebugData<false> dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, config.main_offset); - } -#else - DebugData<false> dummy_debug_data; - RunInterpreter(setup, state, dummy_debug_data, config.main_offset); + jit_engine = nullptr; #endif // ARCHITECTURE_x86_64 } -DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config, - const ShaderSetup& setup) { - UnitState state; - DebugData<true> debug_data; - - // Setup input register table - boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); - const auto& attribute_register_map = config.input_register_map; - for (unsigned i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - - RunInterpreter(setup, state, debug_data, config.main_offset); - return debug_data; -} - } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 2b07759b9..44d9f76c3 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -6,7 +6,6 @@ #include <array> #include <cstddef> -#include <memory> #include <type_traits> #include <nihstro/shader_bytecode.h> #include "common/assert.h" @@ -15,7 +14,6 @@ #include "common/vector_math.h" #include "video_core/pica.h" #include "video_core/pica_types.h" -#include "video_core/shader/debug_data.h" using 
nihstro::RegisterType; using nihstro::SourceRegister; @@ -75,19 +73,13 @@ struct OutputVertex { ret.Lerp(factor, v1); return ret; } + + static OutputVertex FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs, + u32 output_mask); }; static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); -struct OutputRegisters { - OutputRegisters() = default; - - alignas(16) Math::Vec4<float24> value[16]; - - OutputVertex ToVertex(const Regs::ShaderConfig& config) const; -}; -static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD"); - /** * This structure contains the state information that needs to be unique for a shader unit. The 3DS * has four shader units that process shaders in parallel. At the present, Citra only implements a @@ -100,11 +92,10 @@ struct UnitState { // required to be 16-byte aligned. alignas(16) Math::Vec4<float24> input[16]; alignas(16) Math::Vec4<float24> temporary[16]; + alignas(16) Math::Vec4<float24> output[16]; } registers; static_assert(std::is_pod<Registers>::value, "Structure is not POD"); - OutputRegisters output_registers; - bool conditional_code[2]; // Two Address registers and one loop counter @@ -130,7 +121,7 @@ struct UnitState { static size_t OutputOffset(const DestRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Output: - return offsetof(UnitState, output_registers.value) + + return offsetof(UnitState, registers.output) + reg.GetIndex() * sizeof(Math::Vec4<float24>); case RegisterType::Temporary: @@ -142,13 +133,17 @@ struct UnitState { return 0; } } -}; -/// Clears the shader cache -void ClearCache(); + /** + * Loads the unit state with an input vertex. 
+ * + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes to load + */ + void LoadInputVertex(const InputVertex& input, int num_attributes); +}; struct ShaderSetup { - struct { // The float uniforms are accessed by the shader JIT using SSE instructions, and are // therefore required to be 16-byte aligned. @@ -173,32 +168,37 @@ struct ShaderSetup { std::array<u32, 1024> program_code; std::array<u32, 1024> swizzle_data; + /// Data private to ShaderEngines + struct EngineData { + unsigned int entry_point; + /// Used by the JIT, points to a compiled shader object. + const void* cached_shader = nullptr; + } engine_data; +}; + +class ShaderEngine { +public: + virtual ~ShaderEngine() = default; + /** * Performs any shader unit setup that only needs to happen once per shader (as opposed to once * per vertex, which would happen within the `Run` function). */ - void Setup(); - - /** - * Runs the currently setup shader - * @param state Shader unit state, must be setup per shader and per shader unit - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - */ - void Run(UnitState& state, const InputVertex& input, int num_attributes); + virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0; /** - * Produce debug information based on the given shader and input vertex - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @param config Configuration object for the shader pipeline - * @param setup Setup object for the shader pipeline - * @return Debug information for this shader with regards to the given vertex + * Runs the currently setup shader. + * + * @param setup Shader engine state, must be setup with SetupBatch on each shader change. + * @param state Shader unit state, must be setup with input data before each shader invocation. 
*/ - DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, - const Regs::ShaderConfig& config, const ShaderSetup& setup); + virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0; }; +// TODO(yuriks): Remove and make it non-global state somewhere +ShaderEngine* GetEngine(); +void Shutdown(); + } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 20fb9754b..c0c89b857 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -7,10 +7,12 @@ #include <cmath> #include <numeric> #include <boost/container/static_vector.hpp> +#include <boost/range/algorithm/fill.hpp> #include <nihstro/shader_bytecode.h> #include "common/assert.h" #include "common/common_types.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/vector_math.h" #include "video_core/pica_state.h" #include "video_core/pica_types.h" @@ -37,12 +39,15 @@ struct CallStackElement { }; template <bool Debug> -void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, - unsigned offset) { +static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, + unsigned offset) { // TODO: Is there a maximal size for this? 
boost::container::static_vector<CallStackElement, 16> call_stack; u32 program_counter = offset; + state.conditional_code[0] = false; + state.conditional_code[1] = false; + auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { // -1 to make sure when incrementing the PC we end up at the correct offset @@ -73,9 +78,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> } }; - const auto& uniforms = g_state.vs.uniforms; - const auto& swizzle_data = g_state.vs.swizzle_data; - const auto& program_code = g_state.vs.program_code; + const auto& uniforms = setup.uniforms; + const auto& swizzle_data = setup.swizzle_data; + const auto& program_code = setup.program_code; // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; @@ -170,7 +175,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> float24* dest = (instr.common.dest.Value() < 0x10) - ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] + ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -513,7 +518,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> float24* dest = (instr.mad.dest.Value() < 0x10) - ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] + ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] : (instr.mad.dest.Value() < 0x20) ? 
&state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -647,9 +652,33 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> } } -// Explicit instantiation -template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<false>&, unsigned offset); -template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<true>&, unsigned offset); +void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { + ASSERT(entry_point < 1024); + setup.engine_data.entry_point = entry_point; +} + +MICROPROFILE_DECLARE(GPU_Shader); + +void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const { + + MICROPROFILE_SCOPE(GPU_Shader); + + DebugData<false> dummy_debug_data; + RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point); +} + +DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, + const InputVertex& input, + int num_attributes) const { + UnitState state; + DebugData<true> debug_data; + + // Setup input register table + boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); + state.LoadInputVertex(input, num_attributes); + RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); + return debug_data; +} } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index d31dcd7a6..d6c0e2d8c 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -4,18 +4,28 @@ #pragma once +#include "video_core/shader/debug_data.h" +#include "video_core/shader/shader.h" + namespace Pica { namespace Shader { -struct UnitState; - -template <bool Debug> -struct DebugData; - -template <bool Debug> -void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, - unsigned offset); +class InterpreterEngine final : public ShaderEngine { +public: + void 
SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; + void Run(const ShaderSetup& setup, UnitState& state) const override; + + /** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param setup Setup object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ + DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input, + int num_attributes) const; +}; } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index c588b778b..0ee0dd9ef 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -1,888 +1,48 @@ -// Copyright 2015 Citra Emulator Project +// Copyright 2016 Citra Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include <algorithm> -#include <cmath> -#include <cstdint> -#include <nihstro/shader_bytecode.h> -#include <smmintrin.h> -#include <xmmintrin.h> -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/vector_math.h" -#include "common/x64/cpu_detect.h" -#include "common/x64/xbyak_abi.h" -#include "common/x64/xbyak_util.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" +#include "common/hash.h" +#include "common/microprofile.h" #include "video_core/shader/shader.h" #include "video_core/shader/shader_jit_x64.h" - -using namespace Common::X64; -using namespace Xbyak::util; -using Xbyak::Label; -using Xbyak::Reg32; -using Xbyak::Reg64; -using Xbyak::Xmm; +#include "video_core/shader/shader_jit_x64_compiler.h" namespace Pica { - namespace Shader { -typedef void (JitShader::*JitFunction)(Instruction instr); - -const JitFunction instr_table[64] = { - &JitShader::Compile_ADD, // add - &JitShader::Compile_DP3, // dp3 - &JitShader::Compile_DP4, // dp4 - &JitShader::Compile_DPH, // dph - nullptr, // unknown - &JitShader::Compile_EX2, // ex2 - &JitShader::Compile_LG2, // lg2 - nullptr, // unknown - &JitShader::Compile_MUL, // mul - &JitShader::Compile_SGE, // sge - &JitShader::Compile_SLT, // slt - &JitShader::Compile_FLR, // flr - &JitShader::Compile_MAX, // max - &JitShader::Compile_MIN, // min - &JitShader::Compile_RCP, // rcp - &JitShader::Compile_RSQ, // rsq - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_MOVA, // mova - &JitShader::Compile_MOV, // mov - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_DPH, // dphi - nullptr, // unknown - &JitShader::Compile_SGE, // sgei - &JitShader::Compile_SLT, // slti - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - nullptr, // unknown - &JitShader::Compile_NOP, // nop - &JitShader::Compile_END, // end - nullptr, // break - &JitShader::Compile_CALL, // call - 
&JitShader::Compile_CALLC, // callc - &JitShader::Compile_CALLU, // callu - &JitShader::Compile_IF, // ifu - &JitShader::Compile_IF, // ifc - &JitShader::Compile_LOOP, // loop - nullptr, // emit - nullptr, // sete - &JitShader::Compile_JMP, // jmpc - &JitShader::Compile_JMP, // jmpu - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_CMP, // cmp - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // madi - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad - &JitShader::Compile_MAD, // mad -}; - -// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can -// be used as scratch registers within a compiler function. 
The other registers have designated -// purposes, as documented below: +JitX64Engine::JitX64Engine() = default; +JitX64Engine::~JitX64Engine() = default; -/// Pointer to the uniform memory -static const Reg64 SETUP = r9; -/// The two 32-bit VS address offset registers set by the MOVA instruction -static const Reg64 ADDROFFS_REG_0 = r10; -static const Reg64 ADDROFFS_REG_1 = r11; -/// VS loop count register (Multiplied by 16) -static const Reg32 LOOPCOUNT_REG = r12d; -/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) -static const Reg32 LOOPCOUNT = esi; -/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) -static const Reg32 LOOPINC = edi; -/// Result of the previous CMP instruction for the X-component comparison -static const Reg64 COND0 = r13; -/// Result of the previous CMP instruction for the Y-component comparison -static const Reg64 COND1 = r14; -/// Pointer to the UnitState instance for the current VS unit -static const Reg64 STATE = r15; -/// SIMD scratch register -static const Xmm SCRATCH = xmm0; -/// Loaded with the first swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC1 = xmm1; -/// Loaded with the second swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC2 = xmm2; -/// Loaded with the third swizzled source register, otherwise can be used as a scratch register -static const Xmm SRC3 = xmm3; -/// Additional scratch register -static const Xmm SCRATCH2 = xmm4; -/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one -static const Xmm ONE = xmm14; -/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR -static const Xmm NEGBIT = xmm15; +void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { + ASSERT(entry_point < 1024); + setup.engine_data.entry_point = entry_point; -// State registers that must not be modified 
by external functions calls -// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed -static const BitSet32 persistent_regs = BuildRegSet({ - // Pointers to register blocks - SETUP, STATE, - // Cached registers - ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, - // Constants - ONE, NEGBIT, -}); + u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); + u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data)); -/// Raw constant for the source register selector that indicates no swizzling is performed -static const u8 NO_SRC_REG_SWIZZLE = 0x1b; -/// Raw constant for the destination register enable mask that indicates all components are enabled -static const u8 NO_DEST_REG_MASK = 0xf; - -/** - * Get the vertex shader instruction for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ -static Instruction GetVertexShaderInstruction(size_t offset) { - return {g_state.vs.program_code[offset]}; -} - -static void LogCritical(const char* msg) { - LOG_CRITICAL(HW_GPU, "%s", msg); -} - -void JitShader::Compile_Assert(bool condition, const char* msg) { - if (!condition) { - mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); - CallFarFunction(*this, LogCritical); - } -} - -/** - * Loads and swizzles a source register into the specified XMM register. 
- * @param instr VS instruction, used for determining how to load the source register - * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) - * @param src_reg SourceRegister object corresponding to the source register to load - * @param dest Destination XMM register to store the loaded, swizzled source register - */ -void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xmm dest) { - Reg64 src_ptr; - size_t src_offset; - - if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = SETUP; - src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); + u64 cache_key = code_hash ^ swizzle_hash; + auto iter = cache.find(cache_key); + if (iter != cache.end()) { + setup.engine_data.cached_shader = iter->second.get(); } else { - src_ptr = STATE; - src_offset = UnitState::InputOffset(src_reg); - } - - int src_offset_disp = (int)src_offset; - ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); - - unsigned operand_desc_id; - - const bool is_inverted = - (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); - - unsigned address_register_index; - unsigned offset_src; - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - operand_desc_id = instr.mad.operand_desc_id; - offset_src = is_inverted ? 3 : 2; - address_register_index = instr.mad.address_register_index; - } else { - operand_desc_id = instr.common.operand_desc_id; - offset_src = is_inverted ? 
2 : 1; - address_register_index = instr.common.address_register_index; - } - - if (src_num == offset_src && address_register_index != 0) { - switch (address_register_index) { - case 1: // address offset 1 - movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); - break; - case 2: // address offset 2 - movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); - break; - case 3: // address offset 3 - movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); - break; - default: - UNREACHABLE(); - break; - } - } else { - // Load the source - movaps(dest, xword[src_ptr + src_offset_disp]); - } - - SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; - - // Generate instructions for source register swizzling as needed - u8 sel = swiz.GetRawSelector(src_num); - if (sel != NO_SRC_REG_SWIZZLE) { - // Selector component order needs to be reversed for the SHUFPS instruction - sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); - - // Shuffle inputs for swizzle - shufps(dest, dest, sel); - } - - // If the source register should be negated, flip the negative bit using XOR - const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; - if (negate[src_num - 1]) { - xorps(dest, NEGBIT); + auto shader = std::make_unique<JitShader>(); + shader->Compile(&setup.program_code, &setup.swizzle_data); + setup.engine_data.cached_shader = shader.get(); + cache.emplace_hint(iter, cache_key, std::move(shader)); } } -void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { - DestRegister dest; - unsigned operand_desc_id; - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || - instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - operand_desc_id = instr.mad.operand_desc_id; - dest = instr.mad.dest.Value(); - } else { - operand_desc_id = instr.common.operand_desc_id; - dest = instr.common.dest.Value(); - } - - SwizzlePattern swiz = 
{g_state.vs.swizzle_data[operand_desc_id]}; - - size_t dest_offset_disp = UnitState::OutputOffset(dest); - - // If all components are enabled, write the result to the destination register - if (swiz.dest_mask == NO_DEST_REG_MASK) { - // Store dest back to memory - movaps(xword[STATE + dest_offset_disp], src); - - } else { - // Not all components are enabled, so mask the result when storing to the destination - // register... - movaps(SCRATCH, xword[STATE + dest_offset_disp]); - - if (Common::GetCPUCaps().sse4_1) { - u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | - ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); - blendps(SCRATCH, src, mask); - } else { - movaps(SCRATCH2, src); - unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination - unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination - - // Compute selector to selectively copy source components to destination for SHUFPS - // instruction - u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | - ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | - ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | - ((swiz.DestComponentEnabled(3) ? 
2 : 3) << 6); - shufps(SCRATCH, SCRATCH2, sel); - } - - // Store dest back to memory - movaps(xword[STATE + dest_offset_disp], SCRATCH); - } -} - -void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { - movaps(scratch, src1); - cmpordps(scratch, src2); - - mulps(src1, src2); +MICROPROFILE_DECLARE(GPU_Shader); - movaps(src2, src1); - cmpunordps(src2, src2); +void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const { + ASSERT(setup.engine_data.cached_shader != nullptr); - xorps(scratch, src2); - andps(src1, scratch); -} - -void JitShader::Compile_EvaluateCondition(Instruction instr) { - // Note: NXOR is used below to check for equality - switch (instr.flow_control.op) { - case Instruction::FlowControlType::Or: - mov(eax, COND0); - mov(ebx, COND1); - xor(eax, (instr.flow_control.refx.Value() ^ 1)); - xor(ebx, (instr.flow_control.refy.Value() ^ 1)); - or (eax, ebx); - break; - - case Instruction::FlowControlType::And: - mov(eax, COND0); - mov(ebx, COND1); - xor(eax, (instr.flow_control.refx.Value() ^ 1)); - xor(ebx, (instr.flow_control.refy.Value() ^ 1)); - and(eax, ebx); - break; - - case Instruction::FlowControlType::JustX: - mov(eax, COND0); - xor(eax, (instr.flow_control.refx.Value() ^ 1)); - break; - - case Instruction::FlowControlType::JustY: - mov(eax, COND1); - xor(eax, (instr.flow_control.refy.Value() ^ 1)); - break; - } -} + MICROPROFILE_SCOPE(GPU_Shader); -void JitShader::Compile_UniformCondition(Instruction instr) { - size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); - cmp(byte[SETUP + offset], 0); + const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader); + shader->Run(setup, state, setup.engine_data.entry_point); } -BitSet32 JitShader::PersistentCallerSavedRegs() { - return persistent_regs & ABI_ALL_CALLER_SAVED; -} - -void JitShader::Compile_ADD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, 
instr.common.src2, SRC2); - addps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP3(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); - - movaps(SRC3, SRC1); - shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); - - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); - addps(SRC1, SRC2); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DP4(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_DPH(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - if (Common::GetCPUCaps().sse4_1) { - // Set 4th component to 1.0 - blendps(SRC1, ONE, 0b1000); - } else { - // Set 4th component to 1.0 - movaps(SCRATCH, SRC1); - unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ - unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - addps(SRC1, SRC2); - - movaps(SRC2, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - addps(SRC1, SRC2); - - Compile_DestEnable(instr, SRC1); -} - -void 
JitShader::Compile_EX2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, exp2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_LG2(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, log2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MUL(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_SGE(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpleps(SRC2, SRC1); - andps(SRC2, ONE); - - Compile_DestEnable(instr, SRC2); -} - -void JitShader::Compile_SLT(Instruction instr) { - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { - Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); - } else { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - } - - cmpltps(SRC1, SRC2); - andps(SRC1, ONE); 
- - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_FLR(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - if (Common::GetCPUCaps().sse4_1) { - roundps(SRC1, SRC1, _MM_FROUND_FLOOR); - } else { - cvttps2dq(SRC1, SRC1); - cvtdq2ps(SRC1, SRC1); - } - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MAX(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. - maxps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MIN(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. - minps(SRC1, SRC2); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]}; - - if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { - return; // NoOp - } - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // Convert floats to integers using truncation (only care about X and Y components) - cvttps2dq(SRC1, SRC1); - - // Get result - movq(rax, SRC1); - - // Handle destination enable - if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Move and sign-extend high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - shl(ADDROFFS_REG_1, 4); - } else { - if (swiz.DestComponentEnabled(0)) { - // Move and sign-extend low 32 bits - movsxd(ADDROFFS_REG_0, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - } else if (swiz.DestComponentEnabled(1)) { - // Move and sign-extend 
high 32 bits - shr(rax, 32); - movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_1, 4); - } - } -} - -void JitShader::Compile_MOV(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RCP(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. - rcpss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_RSQ(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - - // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica - // performs this operation more accurately. This should be checked on hardware. - rsqrtss(SRC1, SRC1); - shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_NOP(Instruction instr) {} - -void JitShader::Compile_END(Instruction instr) { - ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); - ret(); -} - -void JitShader::Compile_CALL(Instruction instr) { - // Push offset of the return - push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); - - // Call the subroutine - call(instruction_labels[instr.flow_control.dest_offset]); - - // Skip over the return offset that's on the stack - add(rsp, 8); -} - -void JitShader::Compile_CALLC(Instruction instr) { - Compile_EvaluateCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CALLU(Instruction instr) { - Compile_UniformCondition(instr); - Label b; - jz(b); - Compile_CALL(instr); - L(b); -} - -void JitShader::Compile_CMP(Instruction instr) { - using Op = 
Instruction::Common::CompareOpType::Op; - Op op_x = instr.common.compare_op.x; - Op op_y = instr.common.compare_op.y; - - Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - - // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to - // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here - // because they don't match when used with NaNs. - static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; - - bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); - Xmm lhs_x = invert_op_x ? SRC2 : SRC1; - Xmm rhs_x = invert_op_x ? SRC1 : SRC2; - - if (op_x == op_y) { - // Compare X-component and Y-component together - cmpps(lhs_x, rhs_x, cmp[op_x]); - movq(COND0, lhs_x); - - mov(COND1, COND0); - } else { - bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); - Xmm lhs_y = invert_op_y ? SRC2 : SRC1; - Xmm rhs_y = invert_op_y ? 
SRC1 : SRC2; - - // Compare X-component - movaps(SCRATCH, lhs_x); - cmpss(SCRATCH, rhs_x, cmp[op_x]); - - // Compare Y-component - cmpps(lhs_y, rhs_y, cmp[op_y]); - - movq(COND0, SCRATCH); - movq(COND1, lhs_y); - } - - shr(COND0.cvt32(), 31); // ignores upper 32 bits in source - shr(COND1, 63); -} - -void JitShader::Compile_MAD(Instruction instr) { - Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); - - if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { - Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); - } else { - Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); - Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); - } - - Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - addps(SRC1, SRC3); - - Compile_DestEnable(instr, SRC1); -} - -void JitShader::Compile_IF(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards if-statements not supported"); - Label l_else, l_endif; - - // Evaluate the "IF" condition - if (instr.opcode.Value() == OpCode::Id::IFU) { - Compile_UniformCondition(instr); - } else if (instr.opcode.Value() == OpCode::Id::IFC) { - Compile_EvaluateCondition(instr); - } - jz(l_else, T_NEAR); - - // Compile the code that corresponds to the condition evaluating as true - Compile_Block(instr.flow_control.dest_offset); - - // If there isn't an "ELSE" condition, we are done here - if (instr.flow_control.num_instructions == 0) { - L(l_else); - return; - } - - jmp(l_endif, T_NEAR); - - L(l_else); - // This code corresponds to the "ELSE" condition - // Comple the code that corresponds to the condition evaluating as false - Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); - - L(l_endif); -} - -void JitShader::Compile_LOOP(Instruction instr) { - Compile_Assert(instr.flow_control.dest_offset >= program_counter, - "Backwards loops not supported"); - Compile_Assert(!looping, "Nested loops not supported"); - - 
looping = true; - - // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. - // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by - // 4 bits) to be used as an offset into the 16-byte vector registers later - size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); - mov(LOOPCOUNT, dword[SETUP + offset]); - mov(LOOPCOUNT_REG, LOOPCOUNT); - shr(LOOPCOUNT_REG, 4); - and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start - mov(LOOPINC, LOOPCOUNT); - shr(LOOPINC, 12); - and(LOOPINC, 0xFF0); // Z-component is the incrementer - movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count - add(LOOPCOUNT, 1); // Iteration count is X-component + 1 - - Label l_loop_start; - L(l_loop_start); - - Compile_Block(instr.flow_control.dest_offset + 1); - - add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component - sub(LOOPCOUNT, 1); // Increment loop count by 1 - jnz(l_loop_start); // Loop if not equal - - looping = false; -} - -void JitShader::Compile_JMP(Instruction instr) { - if (instr.opcode.Value() == OpCode::Id::JMPC) - Compile_EvaluateCondition(instr); - else if (instr.opcode.Value() == OpCode::Id::JMPU) - Compile_UniformCondition(instr); - else - UNREACHABLE(); - - bool inverted_condition = - (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); - - Label& b = instruction_labels[instr.flow_control.dest_offset]; - if (inverted_condition) { - jz(b, T_NEAR); - } else { - jnz(b, T_NEAR); - } -} - -void JitShader::Compile_Block(unsigned end) { - while (program_counter < end) { - Compile_NextInstr(); - } -} - -void JitShader::Compile_Return() { - // Peek return offset on the stack and check if we're at that offset - mov(rax, qword[rsp + 8]); - cmp(eax, (program_counter)); - - // If so, jump back to before CALL - Label b; - jnz(b); - ret(); - L(b); -} - -void JitShader::Compile_NextInstr() { - if 
(std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { - Compile_Return(); - } - - L(instruction_labels[program_counter]); - - Instruction instr = GetVertexShaderInstruction(program_counter++); - - OpCode::Id opcode = instr.opcode.Value(); - auto instr_func = instr_table[static_cast<unsigned>(opcode)]; - - if (instr_func) { - // JIT the instruction! - ((*this).*instr_func)(instr); - } else { - // Unhandled instruction - LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", - instr.opcode.Value().EffectiveOpCode(), instr.hex); - } -} - -void JitShader::FindReturnOffsets() { - return_offsets.clear(); - - for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { - Instruction instr = GetVertexShaderInstruction(offset); - - switch (instr.opcode.Value()) { - case OpCode::Id::CALL: - case OpCode::Id::CALLC: - case OpCode::Id::CALLU: - return_offsets.push_back(instr.flow_control.dest_offset + - instr.flow_control.num_instructions); - break; - default: - break; - } - } - - // Sort for efficient binary search later - std::sort(return_offsets.begin(), return_offsets.end()); -} - -void JitShader::Compile() { - // Reset flow control state - program = (CompiledShader*)getCurr(); - program_counter = 0; - looping = false; - instruction_labels.fill(Xbyak::Label()); - - // Find all `CALL` instructions and identify return locations - FindReturnOffsets(); - - // The stack pointer is 8 modulo 16 at the entry of a procedure - ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); - - mov(SETUP, ABI_PARAM1); - mov(STATE, ABI_PARAM2); - - // Zero address/loop registers - xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); - xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); - xor(LOOPCOUNT_REG, LOOPCOUNT_REG); - - // Used to set a register to one - static const __m128 one = {1.f, 1.f, 1.f, 1.f}; - mov(rax, reinterpret_cast<size_t>(&one)); - movaps(ONE, xword[rax]); - - // Used to negate registers - static const 
__m128 neg = {-0.f, -0.f, -0.f, -0.f}; - mov(rax, reinterpret_cast<size_t>(&neg)); - movaps(NEGBIT, xword[rax]); - - // Jump to start of the shader program - jmp(ABI_PARAM3); - - // Compile entire program - Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); - - // Free memory that's no longer needed - return_offsets.clear(); - return_offsets.shrink_to_fit(); - - ready(); - - uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program); - ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); - LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); -} - -JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} - } // namespace Shader - } // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index f37548306..078b2cba5 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -1,121 +1,30 @@ -// Copyright 2015 Citra Emulator Project +// Copyright 2016 Citra Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once -#include <array> -#include <cstddef> -#include <utility> -#include <vector> -#include <nihstro/shader_bytecode.h> -#include <xbyak.h> -#include "common/bit_set.h" +#include <memory> +#include <unordered_map> #include "common/common_types.h" -#include "common/x64/emitter.h" #include "video_core/shader/shader.h" -using nihstro::Instruction; -using nihstro::OpCode; -using nihstro::SwizzlePattern; - namespace Pica { - namespace Shader { -/// Memory allocated for each compiled shader (64Kb) -constexpr size_t MAX_SHADER_SIZE = 1024 * 64; +class JitShader; -/** - * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 - * code that can be executed on the host machine directly. 
- */ -class JitShader : public Xbyak::CodeGenerator { +class JitX64Engine final : public ShaderEngine { public: - JitShader(); - - void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { - program(&setup, &state, instruction_labels[offset].getAddress()); - } - - void Compile(); + JitX64Engine(); + ~JitX64Engine() override; - void Compile_ADD(Instruction instr); - void Compile_DP3(Instruction instr); - void Compile_DP4(Instruction instr); - void Compile_DPH(Instruction instr); - void Compile_EX2(Instruction instr); - void Compile_LG2(Instruction instr); - void Compile_MUL(Instruction instr); - void Compile_SGE(Instruction instr); - void Compile_SLT(Instruction instr); - void Compile_FLR(Instruction instr); - void Compile_MAX(Instruction instr); - void Compile_MIN(Instruction instr); - void Compile_RCP(Instruction instr); - void Compile_RSQ(Instruction instr); - void Compile_MOVA(Instruction instr); - void Compile_MOV(Instruction instr); - void Compile_NOP(Instruction instr); - void Compile_END(Instruction instr); - void Compile_CALL(Instruction instr); - void Compile_CALLC(Instruction instr); - void Compile_CALLU(Instruction instr); - void Compile_IF(Instruction instr); - void Compile_LOOP(Instruction instr); - void Compile_JMP(Instruction instr); - void Compile_CMP(Instruction instr); - void Compile_MAD(Instruction instr); + void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; + void Run(const ShaderSetup& setup, UnitState& state) const override; private: - void Compile_Block(unsigned end); - void Compile_NextInstr(); - - void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, - Xbyak::Xmm dest); - void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); - - /** - * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying - * zero by inf. Clobbers `src2` and `scratch`. 
- */ - void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); - - void Compile_EvaluateCondition(Instruction instr); - void Compile_UniformCondition(Instruction instr); - - /** - * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction. - */ - void Compile_Return(); - - BitSet32 PersistentCallerSavedRegs(); - - /** - * Assertion evaluated at compile-time, but only triggered if executed at runtime. - * @param msg Message to be logged if the assertion fails. - */ - void Compile_Assert(bool condition, const char* msg); - - /** - * Analyzes the entire shader program for `CALL` instructions before emitting any code, - * identifying the locations where a return needs to be inserted. - */ - void FindReturnOffsets(); - - /// Mapping of Pica VS instructions to pointers in the emitted code - std::array<Xbyak::Label, 1024> instruction_labels; - - /// Offsets in code where a return needs to be inserted - std::vector<unsigned> return_offsets; - - unsigned program_counter = 0; ///< Offset of the next instruction to decode - bool looping = false; ///< True if compiling a loop, used to check for nested loops - - using CompiledShader = void(const void* setup, void* state, const u8* start_addr); - CompiledShader* program = nullptr; + std::unordered_map<u64, std::unique_ptr<JitShader>> cache; }; -} // Shader - -} // Pica +} // namespace Shader +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp new file mode 100644 index 000000000..49806e8c9 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -0,0 +1,884 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <nihstro/shader_bytecode.h> +#include <smmintrin.h> +#include <xmmintrin.h> +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/vector_math.h" +#include "common/x64/cpu_detect.h" +#include "common/x64/xbyak_abi.h" +#include "common/x64/xbyak_util.h" +#include "video_core/pica_state.h" +#include "video_core/pica_types.h" +#include "video_core/shader/shader.h" +#include "video_core/shader/shader_jit_x64_compiler.h" + +using namespace Common::X64; +using namespace Xbyak::util; +using Xbyak::Label; +using Xbyak::Reg32; +using Xbyak::Reg64; +using Xbyak::Xmm; + +namespace Pica { + +namespace Shader { + +typedef void (JitShader::*JitFunction)(Instruction instr); + +const JitFunction instr_table[64] = { + &JitShader::Compile_ADD, // add + &JitShader::Compile_DP3, // dp3 + &JitShader::Compile_DP4, // dp4 + &JitShader::Compile_DPH, // dph + nullptr, // unknown + &JitShader::Compile_EX2, // ex2 + &JitShader::Compile_LG2, // lg2 + nullptr, // unknown + &JitShader::Compile_MUL, // mul + &JitShader::Compile_SGE, // sge + &JitShader::Compile_SLT, // slt + &JitShader::Compile_FLR, // flr + &JitShader::Compile_MAX, // max + &JitShader::Compile_MIN, // min + &JitShader::Compile_RCP, // rcp + &JitShader::Compile_RSQ, // rsq + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_MOVA, // mova + &JitShader::Compile_MOV, // mov + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_DPH, // dphi + nullptr, // unknown + &JitShader::Compile_SGE, // sgei + &JitShader::Compile_SLT, // slti + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitShader::Compile_NOP, // nop + &JitShader::Compile_END, // end + nullptr, // break + &JitShader::Compile_CALL, // call + &JitShader::Compile_CALLC, // callc + &JitShader::Compile_CALLU, // callu + &JitShader::Compile_IF, // ifu + 
&JitShader::Compile_IF, // ifc + &JitShader::Compile_LOOP, // loop + nullptr, // emit + nullptr, // sete + &JitShader::Compile_JMP, // jmpc + &JitShader::Compile_JMP, // jmpu + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad +}; + +// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can +// be used as scratch registers within a compiler function. The other registers have designated +// purposes, as documented below: + +/// Pointer to the uniform memory +static const Reg64 SETUP = r9; +/// The two 32-bit VS address offset registers set by the MOVA instruction +static const Reg64 ADDROFFS_REG_0 = r10; +static const Reg64 ADDROFFS_REG_1 = r11; +/// VS loop count register (Multiplied by 16) +static const Reg32 LOOPCOUNT_REG = r12d; +/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) +static const Reg32 LOOPCOUNT = esi; +/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) +static const Reg32 LOOPINC = edi; +/// Result of the previous CMP instruction for the X-component comparison +static const Reg64 COND0 = r13; +/// Result of the previous CMP instruction for the Y-component comparison +static const Reg64 COND1 = r14; +/// Pointer to the UnitState instance for the current VS unit +static const Reg64 STATE = r15; +/// SIMD scratch register +static const Xmm SCRATCH = xmm0; +/// Loaded with the 
first swizzled source register, otherwise can be used as a scratch register +static const Xmm SRC1 = xmm1; +/// Loaded with the second swizzled source register, otherwise can be used as a scratch register +static const Xmm SRC2 = xmm2; +/// Loaded with the third swizzled source register, otherwise can be used as a scratch register +static const Xmm SRC3 = xmm3; +/// Additional scratch register +static const Xmm SCRATCH2 = xmm4; +/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one +static const Xmm ONE = xmm14; +/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR +static const Xmm NEGBIT = xmm15; + +// State registers that must not be modified by external functions calls +// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed +static const BitSet32 persistent_regs = BuildRegSet({ + // Pointers to register blocks + SETUP, STATE, + // Cached registers + ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, + // Constants + ONE, NEGBIT, +}); + +/// Raw constant for the source register selector that indicates no swizzling is performed +static const u8 NO_SRC_REG_SWIZZLE = 0x1b; +/// Raw constant for the destination register enable mask that indicates all components are enabled +static const u8 NO_DEST_REG_MASK = 0xf; + +static void LogCritical(const char* msg) { + LOG_CRITICAL(HW_GPU, "%s", msg); +} + +void JitShader::Compile_Assert(bool condition, const char* msg) { + if (!condition) { + mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); + CallFarFunction(*this, LogCritical); + } +} + +/** + * Loads and swizzles a source register into the specified XMM register. 
+ * @param instr VS instruction, used for determining how to load the source register + * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) + * @param src_reg SourceRegister object corresponding to the source register to load + * @param dest Destination XMM register to store the loaded, swizzled source register + */ +void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, + Xmm dest) { + Reg64 src_ptr; + size_t src_offset; + + if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { + src_ptr = SETUP; + src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); + } else { + src_ptr = STATE; + src_offset = UnitState::InputOffset(src_reg); + } + + int src_offset_disp = (int)src_offset; + ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); + + unsigned operand_desc_id; + + const bool is_inverted = + (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); + + unsigned address_register_index; + unsigned offset_src; + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + offset_src = is_inverted ? 3 : 2; + address_register_index = instr.mad.address_register_index; + } else { + operand_desc_id = instr.common.operand_desc_id; + offset_src = is_inverted ? 
2 : 1; + address_register_index = instr.common.address_register_index; + } + + if (src_num == offset_src && address_register_index != 0) { + switch (address_register_index) { + case 1: // address offset 1 + movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); + break; + case 2: // address offset 2 + movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); + break; + case 3: // address offset 3 + movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); + break; + default: + UNREACHABLE(); + break; + } + } else { + // Load the source + movaps(dest, xword[src_ptr + src_offset_disp]); + } + + SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; + + // Generate instructions for source register swizzling as needed + u8 sel = swiz.GetRawSelector(src_num); + if (sel != NO_SRC_REG_SWIZZLE) { + // Selector component order needs to be reversed for the SHUFPS instruction + sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); + + // Shuffle inputs for swizzle + shufps(dest, dest, sel); + } + + // If the source register should be negated, flip the negative bit using XOR + const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; + if (negate[src_num - 1]) { + xorps(dest, NEGBIT); + } +} + +void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { + DestRegister dest; + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + dest = instr.mad.dest.Value(); + } else { + operand_desc_id = instr.common.operand_desc_id; + dest = instr.common.dest.Value(); + } + + SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; + + size_t dest_offset_disp = UnitState::OutputOffset(dest); + + // If all components are enabled, write the result to the destination register + if (swiz.dest_mask == NO_DEST_REG_MASK) { + // Store dest back to memory + 
movaps(xword[STATE + dest_offset_disp], src); + + } else { + // Not all components are enabled, so mask the result when storing to the destination + // register... + movaps(SCRATCH, xword[STATE + dest_offset_disp]); + + if (Common::GetCPUCaps().sse4_1) { + u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | + ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); + blendps(SCRATCH, src, mask); + } else { + movaps(SCRATCH2, src); + unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination + unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination + + // Compute selector to selectively copy source components to destination for SHUFPS + // instruction + u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | + ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | + ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | + ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); + shufps(SCRATCH, SCRATCH2, sel); + } + + // Store dest back to memory + movaps(xword[STATE + dest_offset_disp], SCRATCH); + } +} + +void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { + movaps(scratch, src1); + cmpordps(scratch, src2); + + mulps(src1, src2); + + movaps(src2, src1); + cmpunordps(src2, src2); + + xorps(scratch, src2); + andps(src1, scratch); +} + +void JitShader::Compile_EvaluateCondition(Instruction instr) { + // Note: NXOR is used below to check for equality + switch (instr.flow_control.op) { + case Instruction::FlowControlType::Or: + mov(eax, COND0); + mov(ebx, COND1); + xor(eax, (instr.flow_control.refx.Value() ^ 1)); + xor(ebx, (instr.flow_control.refy.Value() ^ 1)); + or (eax, ebx); + break; + + case Instruction::FlowControlType::And: + mov(eax, COND0); + mov(ebx, COND1); + xor(eax, (instr.flow_control.refx.Value() ^ 1)); + xor(ebx, (instr.flow_control.refy.Value() ^ 1)); + and(eax, ebx); + break; + + case Instruction::FlowControlType::JustX: + mov(eax, COND0); + xor(eax, (instr.flow_control.refx.Value() ^ 1)); + 
break; + + case Instruction::FlowControlType::JustY: + mov(eax, COND1); + xor(eax, (instr.flow_control.refy.Value() ^ 1)); + break; + } +} + +void JitShader::Compile_UniformCondition(Instruction instr) { + size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); + cmp(byte[SETUP + offset], 0); +} + +BitSet32 JitShader::PersistentCallerSavedRegs() { + return persistent_regs & ABI_ALL_CALLER_SAVED; +} + +void JitShader::Compile_ADD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + addps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DP3(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); + + movaps(SRC3, SRC1); + shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); + + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); + addps(SRC1, SRC2); + addps(SRC1, SRC3); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DP4(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + addps(SRC1, SRC2); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + addps(SRC1, SRC2); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_DPH(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + if 
(Common::GetCPUCaps().sse4_1) { + // Set 4th component to 1.0 + blendps(SRC1, ONE, 0b1000); + } else { + // Set 4th component to 1.0 + movaps(SCRATCH, SRC1); + unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ + unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 + } + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + addps(SRC1, SRC2); + + movaps(SRC2, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + addps(SRC1, SRC2); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_EX2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + movss(xmm0, SRC1); // ABI_PARAM1 + + ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + CallFarFunction(*this, exp2f); + ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN + movaps(SRC1, xmm0); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_LG2(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + movss(xmm0, SRC1); // ABI_PARAM1 + + ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + CallFarFunction(*this, log2f); + ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN + movaps(SRC1, xmm0); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MUL(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_SGE(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, 
instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + cmpleps(SRC2, SRC1); + andps(SRC2, ONE); + + Compile_DestEnable(instr, SRC2); +} + +void JitShader::Compile_SLT(Instruction instr) { + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); + } else { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + } + + cmpltps(SRC1, SRC2); + andps(SRC1, ONE); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_FLR(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + if (Common::GetCPUCaps().sse4_1) { + roundps(SRC1, SRC1, _MM_FROUND_FLOOR); + } else { + cvttps2dq(SRC1, SRC1); + cvtdq2ps(SRC1, SRC1); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MAX(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. + maxps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MIN(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. 
+ minps(SRC1, SRC2); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_MOVA(Instruction instr) { + SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]}; + + if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { + return; // NoOp + } + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // Convert floats to integers using truncation (only care about X and Y components) + cvttps2dq(SRC1, SRC1); + + // Get result + movq(rax, SRC1); + + // Handle destination enable + if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { + // Move and sign-extend low 32 bits + movsxd(ADDROFFS_REG_0, eax); + + // Move and sign-extend high 32 bits + shr(rax, 32); + movsxd(ADDROFFS_REG_1, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_0, 4); + shl(ADDROFFS_REG_1, 4); + } else { + if (swiz.DestComponentEnabled(0)) { + // Move and sign-extend low 32 bits + movsxd(ADDROFFS_REG_0, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_0, 4); + } else if (swiz.DestComponentEnabled(1)) { + // Move and sign-extend high 32 bits + shr(rax, 32); + movsxd(ADDROFFS_REG_1, eax); + + // Multiply by 16 to be used as an offset later + shl(ADDROFFS_REG_1, 4); + } + } +} + +void JitShader::Compile_MOV(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_RCP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. 
+ rcpss(SRC1, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_RSQ(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + rsqrtss(SRC1, SRC1); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_NOP(Instruction instr) {} + +void JitShader::Compile_END(Instruction instr) { + ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); + ret(); +} + +void JitShader::Compile_CALL(Instruction instr) { + // Push offset of the return + push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); + + // Call the subroutine + call(instruction_labels[instr.flow_control.dest_offset]); + + // Skip over the return offset that's on the stack + add(rsp, 8); +} + +void JitShader::Compile_CALLC(Instruction instr) { + Compile_EvaluateCondition(instr); + Label b; + jz(b); + Compile_CALL(instr); + L(b); +} + +void JitShader::Compile_CALLU(Instruction instr) { + Compile_UniformCondition(instr); + Label b; + jz(b); + Compile_CALL(instr); + L(b); +} + +void JitShader::Compile_CMP(Instruction instr) { + using Op = Instruction::Common::CompareOpType::Op; + Op op_x = instr.common.compare_op.x; + Op op_y = instr.common.compare_op.y; + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to + // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here + // because they don't match when used with NaNs. 
+ static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; + + bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); + Xmm lhs_x = invert_op_x ? SRC2 : SRC1; + Xmm rhs_x = invert_op_x ? SRC1 : SRC2; + + if (op_x == op_y) { + // Compare X-component and Y-component together + cmpps(lhs_x, rhs_x, cmp[op_x]); + movq(COND0, lhs_x); + + mov(COND1, COND0); + } else { + bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); + Xmm lhs_y = invert_op_y ? SRC2 : SRC1; + Xmm rhs_y = invert_op_y ? SRC1 : SRC2; + + // Compare X-component + movaps(SCRATCH, lhs_x); + cmpss(SCRATCH, rhs_x, cmp[op_x]); + + // Compare Y-component + cmpps(lhs_y, rhs_y, cmp[op_y]); + + movq(COND0, SCRATCH); + movq(COND1, lhs_y); + } + + shr(COND0.cvt32(), 31); // ignores upper 32 bits in source + shr(COND1, 63); +} + +void JitShader::Compile_MAD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); + } else { + Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); + } + + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + addps(SRC1, SRC3); + + Compile_DestEnable(instr, SRC1); +} + +void JitShader::Compile_IF(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, + "Backwards if-statements not supported"); + Label l_else, l_endif; + + // Evaluate the "IF" condition + if (instr.opcode.Value() == OpCode::Id::IFU) { + Compile_UniformCondition(instr); + } else if (instr.opcode.Value() == OpCode::Id::IFC) { + Compile_EvaluateCondition(instr); + } + jz(l_else, T_NEAR); + + // Compile the code that corresponds to the condition evaluating as true + Compile_Block(instr.flow_control.dest_offset); + + // If there isn't an "ELSE" condition, we are done here + if 
(instr.flow_control.num_instructions == 0) { + L(l_else); + return; + } + + jmp(l_endif, T_NEAR); + + L(l_else); + // This code corresponds to the "ELSE" condition + // Compile the code that corresponds to the condition evaluating as false + Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); + + L(l_endif); +} + +void JitShader::Compile_LOOP(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, + "Backwards loops not supported"); + Compile_Assert(!looping, "Nested loops not supported"); + + looping = true; + + // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. + // The Y (LOOPCOUNT_REG) and Z (LOOPINC) components are kept multiplied by 16 (Left shifted by + // 4 bits) to be used as an offset into the 16-byte vector registers later + size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); + mov(LOOPCOUNT, dword[SETUP + offset]); + mov(LOOPCOUNT_REG, LOOPCOUNT); + shr(LOOPCOUNT_REG, 4); + and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start + mov(LOOPINC, LOOPCOUNT); + shr(LOOPINC, 12); + and(LOOPINC, 0xFF0); // Z-component is the incrementer + movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count + add(LOOPCOUNT, 1); // Iteration count is X-component + 1 + + Label l_loop_start; + L(l_loop_start); + + Compile_Block(instr.flow_control.dest_offset + 1); + + add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component + sub(LOOPCOUNT, 1); // Decrement loop count by 1 + jnz(l_loop_start); // Loop if not equal + + looping = false; +} + +void JitShader::Compile_JMP(Instruction instr) { + if (instr.opcode.Value() == OpCode::Id::JMPC) + Compile_EvaluateCondition(instr); + else if (instr.opcode.Value() == OpCode::Id::JMPU) + Compile_UniformCondition(instr); + else + UNREACHABLE(); + + bool inverted_condition = + (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); + + Label& 
b = instruction_labels[instr.flow_control.dest_offset]; + if (inverted_condition) { + jz(b, T_NEAR); + } else { + jnz(b, T_NEAR); + } +} + +void JitShader::Compile_Block(unsigned end) { + while (program_counter < end) { + Compile_NextInstr(); + } +} + +void JitShader::Compile_Return() { + // Peek return offset on the stack and check if we're at that offset + mov(rax, qword[rsp + 8]); + cmp(eax, (program_counter)); + + // If so, jump back to before CALL + Label b; + jnz(b); + ret(); + L(b); +} + +void JitShader::Compile_NextInstr() { + if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { + Compile_Return(); + } + + L(instruction_labels[program_counter]); + + Instruction instr = {(*program_code)[program_counter++]}; + + OpCode::Id opcode = instr.opcode.Value(); + auto instr_func = instr_table[static_cast<unsigned>(opcode)]; + + if (instr_func) { + // JIT the instruction! + ((*this).*instr_func)(instr); + } else { + // Unhandled instruction + LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", + instr.opcode.Value().EffectiveOpCode(), instr.hex); + } +} + +void JitShader::FindReturnOffsets() { + return_offsets.clear(); + + for (size_t offset = 0; offset < program_code->size(); ++offset) { + Instruction instr = {(*program_code)[offset]}; + + switch (instr.opcode.Value()) { + case OpCode::Id::CALL: + case OpCode::Id::CALLC: + case OpCode::Id::CALLU: + return_offsets.push_back(instr.flow_control.dest_offset + + instr.flow_control.num_instructions); + break; + default: + break; + } + } + + // Sort for efficient binary search later + std::sort(return_offsets.begin(), return_offsets.end()); +} + +void JitShader::Compile(const std::array<u32, 1024>* program_code_, + const std::array<u32, 1024>* swizzle_data_) { + program_code = program_code_; + swizzle_data = swizzle_data_; + + // Reset flow control state + program = (CompiledShader*)getCurr(); + program_counter = 0; + looping = false; + instruction_labels.fill(Xbyak::Label()); 
+ + // Find all `CALL` instructions and identify return locations + FindReturnOffsets(); + + // The stack pointer is 8 modulo 16 at the entry of a procedure + ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); + + mov(SETUP, ABI_PARAM1); + mov(STATE, ABI_PARAM2); + + // Zero address/loop registers + xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); + xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); + xor(LOOPCOUNT_REG, LOOPCOUNT_REG); + + // Used to set a register to one + static const __m128 one = {1.f, 1.f, 1.f, 1.f}; + mov(rax, reinterpret_cast<size_t>(&one)); + movaps(ONE, xword[rax]); + + // Used to negate registers + static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; + mov(rax, reinterpret_cast<size_t>(&neg)); + movaps(NEGBIT, xword[rax]); + + // Jump to start of the shader program + jmp(ABI_PARAM3); + + // Compile entire program + Compile_Block(static_cast<unsigned>(program_code->size())); + + // Free memory that's no longer needed + program_code = nullptr; + swizzle_data = nullptr; + return_offsets.clear(); + return_offsets.shrink_to_fit(); + + ready(); + + ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); + LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); +} + +JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h new file mode 100644 index 000000000..29e9875ea --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -0,0 +1,125 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <array> +#include <cstddef> +#include <utility> +#include <vector> +#include <nihstro/shader_bytecode.h> +#include <xbyak.h> +#include "common/bit_set.h" +#include "common/common_types.h" +#include "common/x64/emitter.h" +#include "video_core/shader/shader.h" + +using nihstro::Instruction; +using nihstro::OpCode; +using nihstro::SwizzlePattern; + +namespace Pica { + +namespace Shader { + +/// Memory allocated for each compiled shader (64Kb) +constexpr size_t MAX_SHADER_SIZE = 1024 * 64; + +/** + * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 + * code that can be executed on the host machine directly. + */ +class JitShader : public Xbyak::CodeGenerator { +public: + JitShader(); + + void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { + program(&setup, &state, instruction_labels[offset].getAddress()); + } + + void Compile(const std::array<u32, 1024>* program_code, + const std::array<u32, 1024>* swizzle_data); + + void Compile_ADD(Instruction instr); + void Compile_DP3(Instruction instr); + void Compile_DP4(Instruction instr); + void Compile_DPH(Instruction instr); + void Compile_EX2(Instruction instr); + void Compile_LG2(Instruction instr); + void Compile_MUL(Instruction instr); + void Compile_SGE(Instruction instr); + void Compile_SLT(Instruction instr); + void Compile_FLR(Instruction instr); + void Compile_MAX(Instruction instr); + void Compile_MIN(Instruction instr); + void Compile_RCP(Instruction instr); + void Compile_RSQ(Instruction instr); + void Compile_MOVA(Instruction instr); + void Compile_MOV(Instruction instr); + void Compile_NOP(Instruction instr); + void Compile_END(Instruction instr); + void Compile_CALL(Instruction instr); + void Compile_CALLC(Instruction instr); + void Compile_CALLU(Instruction instr); + void Compile_IF(Instruction instr); + void Compile_LOOP(Instruction instr); + void Compile_JMP(Instruction instr); + void Compile_CMP(Instruction 
instr); + void Compile_MAD(Instruction instr); + +private: + void Compile_Block(unsigned end); + void Compile_NextInstr(); + + void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, + Xbyak::Xmm dest); + void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); + + /** + * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying + * zero by inf. Clobbers `src2` and `scratch`. + */ + void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); + + void Compile_EvaluateCondition(Instruction instr); + void Compile_UniformCondition(Instruction instr); + + /** + * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. + */ + void Compile_Return(); + + BitSet32 PersistentCallerSavedRegs(); + + /** + * Assertion evaluated at compile-time, but only triggered if executed at runtime. + * @param msg Message to be logged if the assertion fails. + */ + void Compile_Assert(bool condition, const char* msg); + + /** + * Analyzes the entire shader program for `CALL` instructions before emitting any code, + * identifying the locations where a return needs to be inserted. + */ + void FindReturnOffsets(); + + const std::array<u32, 1024>* program_code = nullptr; + const std::array<u32, 1024>* swizzle_data = nullptr; + + /// Mapping of Pica VS instructions to pointers in the emitted code + std::array<Xbyak::Label, 1024> instruction_labels; + + /// Offsets in code where a return needs to be inserted + std::vector<unsigned> return_offsets; + + unsigned program_counter = 0; ///< Offset of the next instruction to decode + bool looping = false; ///< True if compiling a loop, used to check for nested loops + + using CompiledShader = void(const void* setup, void* state, const u8* start_addr); + CompiledShader* program = nullptr; +}; + +} // Shader + +} // Pica |