diff options
Diffstat (limited to 'src')
75 files changed, 896 insertions, 455 deletions
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h index 4dfd41b43..978b1518f 100644 --- a/src/core/arm/arm_interface.h +++ b/src/core/arm/arm_interface.h @@ -7,6 +7,10 @@ #include <array> #include "common/common_types.h" +namespace Common { +struct PageTable; +} + namespace Kernel { enum class VMAPermission : u8; } @@ -49,8 +53,14 @@ public: /// Clear all instruction cache virtual void ClearInstructionCache() = 0; - /// Notify CPU emulation that page tables have changed - virtual void PageTableChanged() = 0; + /// Notifies CPU emulation that the current page table has changed. + /// + /// @param new_page_table The new page table. + /// @param new_address_space_size_in_bits The new usable size of the address space in bits. + /// This can be either 32, 36, or 39 on official software. + /// + virtual void PageTableChanged(Common::PageTable& new_page_table, + std::size_t new_address_space_size_in_bits) = 0; /** * Set the Program Counter to an address diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp index dc96e35d5..44307fa19 100644 --- a/src/core/arm/dynarmic/arm_dynarmic.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic.cpp @@ -14,7 +14,6 @@ #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/gdbstub/gdbstub.h" -#include "core/hle/kernel/kernel.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/svc.h" #include "core/hle/kernel/vm_manager.h" @@ -129,18 +128,16 @@ public: u64 tpidr_el0 = 0; }; -std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit() const { - auto* current_process = system.Kernel().CurrentProcess(); - auto** const page_table = current_process->VMManager().page_table.pointers.data(); - +std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& page_table, + std::size_t address_space_bits) const { Dynarmic::A64::UserConfig config; // Callbacks config.callbacks = cb.get(); // Memory - config.page_table = reinterpret_cast<void**>(page_table); - config.page_table_address_space_bits = current_process->VMManager().GetAddressSpaceWidth(); + config.page_table = reinterpret_cast<void**>(page_table.pointers.data()); + config.page_table_address_space_bits = address_space_bits; config.silently_mirror_page_table = false; // Multi-process state @@ -176,12 +173,7 @@ ARM_Dynarmic::ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index) : cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system}, core_index{core_index}, system{system}, - exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} { - ThreadContext ctx{}; - inner_unicorn.SaveContext(ctx); - PageTableChanged(); - LoadContext(ctx); -} + exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {} ARM_Dynarmic::~ARM_Dynarmic() = default; @@ -276,8 +268,9 @@ void ARM_Dynarmic::ClearExclusiveState() { jit->ClearExclusiveState(); } -void ARM_Dynarmic::PageTableChanged() { - jit = MakeJit(); +void ARM_Dynarmic::PageTableChanged(Common::PageTable& page_table, + std::size_t new_address_space_size_in_bits) { + jit = MakeJit(page_table, new_address_space_size_in_bits); } DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(std::size_t core_count) : monitor(core_count) {} diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic.h index c1db254e8..b701e97a3 100644 --- a/src/core/arm/dynarmic/arm_dynarmic.h +++ b/src/core/arm/dynarmic/arm_dynarmic.h @@ -48,10 +48,12 @@ public: void ClearExclusiveState() override; void ClearInstructionCache() override; - void PageTableChanged() override; + void PageTableChanged(Common::PageTable& new_page_table, + std::size_t new_address_space_size_in_bits) override; private: - std::unique_ptr<Dynarmic::A64::Jit> MakeJit() const; + std::unique_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table, + std::size_t address_space_bits) const; friend class ARM_Dynarmic_Callbacks; std::unique_ptr<ARM_Dynarmic_Callbacks> cb; diff --git a/src/core/arm/unicorn/arm_unicorn.h b/src/core/arm/unicorn/arm_unicorn.h index 209fc16ad..34e974b4d 100644 --- a/src/core/arm/unicorn/arm_unicorn.h +++ b/src/core/arm/unicorn/arm_unicorn.h @@ -41,7 +41,7 @@ public: void Run() override; void Step() override; void ClearInstructionCache() override; - void PageTableChanged() override{}; + void PageTableChanged(Common::PageTable&, std::size_t) override {} void RecordBreak(GDBStub::BreakpointAddress bkpt); private: diff --git a/src/core/core.cpp b/src/core/core.cpp index bc9e887b6..175a5f2ea 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -3,9 +3,7 @@ // Refer to the license.txt file included. #include <array> -#include <map> #include <memory> -#include <thread> #include <utility> #include "common/file_util.h" @@ -38,8 +36,6 @@ #include "frontend/applets/software_keyboard.h" #include "frontend/applets/web_browser.h" #include "video_core/debug_utils/debug_utils.h" -#include "video_core/gpu_asynch.h" -#include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/video_core.h" @@ -81,7 +77,7 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs, return vfs->OpenFile(path, FileSys::Mode::Read); } struct System::Impl { - explicit Impl(System& system) : kernel{system} {} + explicit Impl(System& system) : kernel{system}, cpu_core_manager{system} {} Cpu& CurrentCpuCore() { return cpu_core_manager.GetCurrentCore(); @@ -99,6 +95,7 @@ struct System::Impl { LOG_DEBUG(HW_Memory, "initialized OK"); core_timing.Initialize(); + cpu_core_manager.Initialize(); kernel.Initialize(); const auto current_time = std::chrono::duration_cast<std::chrono::seconds>( @@ -120,9 +117,6 @@ struct System::Impl { if (web_browser == nullptr) web_browser = std::make_unique<Core::Frontend::DefaultWebBrowserApplet>(); - auto main_process = Kernel::Process::Create(system, "main"); - kernel.MakeCurrentProcess(main_process.get()); - telemetry_session = std::make_unique<Core::TelemetrySession>(); service_manager = std::make_shared<Service::SM::ServiceManager>(); @@ -134,15 +128,9 @@ struct System::Impl { return ResultStatus::ErrorVideoCore; } - is_powered_on = true; - - if (Settings::values.use_asynchronous_gpu_emulation) { - gpu_core = std::make_unique<VideoCommon::GPUAsynch>(system, *renderer); - } else { - gpu_core = std::make_unique<VideoCommon::GPUSynch>(system, *renderer); - } + gpu_core = VideoCore::CreateGPU(system); - cpu_core_manager.Initialize(system); + is_powered_on = true; LOG_DEBUG(Core, "Initialized OK"); @@ -179,7 +167,8 @@ struct System::Impl { return init_result; } - const Loader::ResultStatus load_result{app_loader->Load(*kernel.CurrentProcess())}; + auto main_process = Kernel::Process::Create(system, "main"); + const auto [load_result, load_parameters] = app_loader->Load(*main_process); if (load_result != Loader::ResultStatus::Success) { LOG_CRITICAL(Core, "Failed to load ROM (Error {})!", static_cast<int>(load_result)); Shutdown(); @@ -187,6 +176,16 @@ struct System::Impl { return static_cast<ResultStatus>(static_cast<u32>(ResultStatus::ErrorLoader) + static_cast<u32>(load_result)); } + kernel.MakeCurrentProcess(main_process.get()); + + // Main process has been loaded and been made current. + // Begin GPU and CPU execution. + gpu_core->Start(); + cpu_core_manager.StartThreads(); + + // All threads are started, begin main process execution, now that we're in the clear. + main_process->Run(load_parameters->main_thread_priority, + load_parameters->main_thread_stack_size); status = ResultStatus::Success; return status; diff --git a/src/core/cpu_core_manager.cpp b/src/core/cpu_core_manager.cpp index 93bc5619c..8fcb4eeb1 100644 --- a/src/core/cpu_core_manager.cpp +++ b/src/core/cpu_core_manager.cpp @@ -19,17 +19,19 @@ void RunCpuCore(const System& system, Cpu& cpu_state) { } } // Anonymous namespace -CpuCoreManager::CpuCoreManager() = default; +CpuCoreManager::CpuCoreManager(System& system) : system{system} {} CpuCoreManager::~CpuCoreManager() = default; -void CpuCoreManager::Initialize(System& system) { +void CpuCoreManager::Initialize() { barrier = std::make_unique<CpuBarrier>(); exclusive_monitor = Cpu::MakeExclusiveMonitor(cores.size()); for (std::size_t index = 0; index < cores.size(); ++index) { cores[index] = std::make_unique<Cpu>(system, *exclusive_monitor, *barrier, index); } +} +void CpuCoreManager::StartThreads() { // Create threads for CPU cores 1-3, and build thread_to_cpu map // CPU core 0 is run on the main thread thread_to_cpu[std::this_thread::get_id()] = cores[0].get(); diff --git a/src/core/cpu_core_manager.h b/src/core/cpu_core_manager.h index a4d70ec56..2cbbf8216 100644 --- a/src/core/cpu_core_manager.h +++ b/src/core/cpu_core_manager.h @@ -18,7 +18,7 @@ class System; class CpuCoreManager { public: - CpuCoreManager(); + explicit CpuCoreManager(System& system); CpuCoreManager(const CpuCoreManager&) = delete; CpuCoreManager(CpuCoreManager&&) = delete; @@ -27,7 +27,8 @@ public: CpuCoreManager& operator=(const CpuCoreManager&) = delete; CpuCoreManager& operator=(CpuCoreManager&&) = delete; - void Initialize(System& system); + void Initialize(); + void StartThreads(); void Shutdown(); Cpu& GetCore(std::size_t index); @@ -54,6 +55,8 @@ private: /// Map of guest threads to CPU cores std::map<std::thread::id, Cpu*> thread_to_cpu; + + System& system; }; } // namespace Core diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 4d58e7c69..8539fabe4 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -182,7 +182,12 @@ void KernelCore::AppendNewProcess(SharedPtr<Process> process) { void KernelCore::MakeCurrentProcess(Process* process) { impl->current_process = process; - Memory::SetCurrentPageTable(&process->VMManager().page_table); + + if (process == nullptr) { + return; + } + + Memory::SetCurrentPageTable(*process); } Process* KernelCore::CurrentProcess() { diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 8b2b3877d..6d7a7e754 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -28,12 +28,12 @@ namespace { * * @param owner_process The parent process for the main thread * @param kernel The kernel instance to create the main thread under. - * @param entry_point The address at which the thread should start execution * @param priority The priority to give the main thread */ -void SetupMainThread(Process& owner_process, KernelCore& kernel, VAddr entry_point, u32 priority) { - // Initialize new "main" thread - const VAddr stack_top = owner_process.VMManager().GetTLSIORegionEndAddress(); +void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority) { + const auto& vm_manager = owner_process.VMManager(); + const VAddr entry_point = vm_manager.GetCodeRegionBaseAddress(); + const VAddr stack_top = vm_manager.GetTLSIORegionEndAddress(); auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0, owner_process.GetIdealCore(), stack_top, owner_process); @@ -105,8 +105,6 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) { is_64bit_process = metadata.Is64BitProgram(); vm_manager.Reset(metadata.GetAddressSpaceType()); - // Ensure that the potentially resized page table is seen by CPU backends. - Memory::SetCurrentPageTable(&vm_manager.page_table); const auto& caps = metadata.GetKernelCapabilities(); const auto capability_init_result = @@ -118,7 +116,7 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) { return handle_table.SetSize(capabilities.GetHandleTableSize()); } -void Process::Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size) { +void Process::Run(s32 main_thread_priority, u64 stack_size) { // The kernel always ensures that the given stack size is page aligned. main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE); @@ -134,7 +132,7 @@ void Process::Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size) { vm_manager.LogLayout(); ChangeStatus(ProcessStatus::Running); - SetupMainThread(*this, kernel, entry_point, main_thread_priority); + SetupMainThread(*this, kernel, main_thread_priority); } void Process::PrepareForTermination() { @@ -241,9 +239,6 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) { MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData); code_memory_size += module_.memory.size(); - - // Clear instruction cache in CPU JIT - system.InvalidateCpuInstructionCaches(); } Process::Process(Core::System& system) diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index dda52f4c0..bf3b7eef3 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -225,9 +225,12 @@ public: ResultCode LoadFromMetadata(const FileSys::ProgramMetadata& metadata); /** - * Applies address space changes and launches the process main thread. + * Starts the main application thread for this process. + * + * @param main_thread_priority The priority for the main thread. + * @param stack_size The stack size for the main thread in bytes. */ - void Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size); + void Run(s32 main_thread_priority, u64 stack_size); /** * Prepares a process for termination by stopping all of its threads diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp index 4eeb97bef..4c763b288 100644 --- a/src/core/hle/kernel/svc.cpp +++ b/src/core/hle/kernel/svc.cpp @@ -2290,7 +2290,7 @@ static const FunctionDef SVC_Table[] = { {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"}, {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"}, {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"}, - {0x36, nullptr, "Unknown"}, + {0x36, nullptr, "SynchronizePreemptionState"}, {0x37, nullptr, "Unknown"}, {0x38, nullptr, "Unknown"}, {0x39, nullptr, "Unknown"}, diff --git a/src/core/hle/kernel/wait_object.cpp b/src/core/hle/kernel/wait_object.cpp index 90580ed93..c8eaf9488 100644 --- a/src/core/hle/kernel/wait_object.cpp +++ b/src/core/hle/kernel/wait_object.cpp @@ -30,7 +30,7 @@ void WaitObject::RemoveWaitingThread(Thread* thread) { waiting_threads.erase(itr); } -SharedPtr<Thread> WaitObject::GetHighestPriorityReadyThread() { +SharedPtr<Thread> WaitObject::GetHighestPriorityReadyThread() const { Thread* candidate = nullptr; u32 candidate_priority = THREADPRIO_LOWEST + 1; diff --git a/src/core/hle/kernel/wait_object.h b/src/core/hle/kernel/wait_object.h index 04464a51a..3271a30a7 100644 --- a/src/core/hle/kernel/wait_object.h +++ b/src/core/hle/kernel/wait_object.h @@ -54,7 +54,7 @@ public: void WakeupWaitingThread(SharedPtr<Thread> thread); /// Obtains the highest priority thread that is ready to run from this object's waiting list. - SharedPtr<Thread> GetHighestPriorityReadyThread(); + SharedPtr<Thread> GetHighestPriorityReadyThread() const; /// Get a const reference to the waiting threads list for debug use const std::vector<SharedPtr<Thread>>& GetWaitingThreads() const; diff --git a/src/core/hle/service/audio/audctl.cpp b/src/core/hle/service/audio/audctl.cpp index b6b71f966..f43e512e9 100644 --- a/src/core/hle/service/audio/audctl.cpp +++ b/src/core/hle/service/audio/audctl.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/logging/log.h" +#include "core/hle/ipc_helpers.h" #include "core/hle/service/audio/audctl.h" namespace Service::Audio { @@ -11,8 +13,8 @@ AudCtl::AudCtl() : ServiceFramework{"audctl"} { static const FunctionInfo functions[] = { {0, nullptr, "GetTargetVolume"}, {1, nullptr, "SetTargetVolume"}, - {2, nullptr, "GetTargetVolumeMin"}, - {3, nullptr, "GetTargetVolumeMax"}, + {2, &AudCtl::GetTargetVolumeMin, "GetTargetVolumeMin"}, + {3, &AudCtl::GetTargetVolumeMax, "GetTargetVolumeMax"}, {4, nullptr, "IsTargetMute"}, {5, nullptr, "SetTargetMute"}, {6, nullptr, "IsTargetConnected"}, @@ -44,4 +46,28 @@ AudCtl::AudCtl() : ServiceFramework{"audctl"} { AudCtl::~AudCtl() = default; +void AudCtl::GetTargetVolumeMin(Kernel::HLERequestContext& ctx) { + LOG_DEBUG(Audio, "called."); + + // This service function is currently hardcoded on the + // actual console to this value (as of 6.0.0). + constexpr s32 target_min_volume = 0; + + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push(target_min_volume); +} + +void AudCtl::GetTargetVolumeMax(Kernel::HLERequestContext& ctx) { + LOG_DEBUG(Audio, "called."); + + // This service function is currently hardcoded on the + // actual console to this value (as of 6.0.0). + constexpr s32 target_max_volume = 15; + + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push(target_max_volume); +} + } // namespace Service::Audio diff --git a/src/core/hle/service/audio/audctl.h b/src/core/hle/service/audio/audctl.h index 9d2d9e83b..c7fafc02e 100644 --- a/src/core/hle/service/audio/audctl.h +++ b/src/core/hle/service/audio/audctl.h @@ -12,6 +12,10 @@ class AudCtl final : public ServiceFramework<AudCtl> { public: explicit AudCtl(); ~AudCtl() override; + +private: + void GetTargetVolumeMin(Kernel::HLERequestContext& ctx); + void GetTargetVolumeMax(Kernel::HLERequestContext& ctx); }; } // namespace Service::Audio diff --git a/src/core/loader/deconstructed_rom_directory.cpp b/src/core/loader/deconstructed_rom_directory.cpp index 07aa7a1cd..10b13fb1d 100644 --- a/src/core/loader/deconstructed_rom_directory.cpp +++ b/src/core/loader/deconstructed_rom_directory.cpp @@ -86,25 +86,29 @@ FileType AppLoader_DeconstructedRomDirectory::IdentifyType(const FileSys::Virtua return FileType::Error; } -ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) { +AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirectory::Load( + Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } if (dir == nullptr) { - if (file == nullptr) - return ResultStatus::ErrorNullFile; + if (file == nullptr) { + return {ResultStatus::ErrorNullFile, {}}; + } + dir = file->GetContainingDirectory(); } // Read meta to determine title ID FileSys::VirtualFile npdm = dir->GetFile("main.npdm"); - if (npdm == nullptr) - return ResultStatus::ErrorMissingNPDM; + if (npdm == nullptr) { + return {ResultStatus::ErrorMissingNPDM, {}}; + } - ResultStatus result = metadata.Load(npdm); + const ResultStatus result = metadata.Load(npdm); if (result != ResultStatus::Success) { - return result; + return {result, {}}; } if (override_update) { @@ -114,23 +118,24 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) // Reread in case PatchExeFS affected the main.npdm npdm = dir->GetFile("main.npdm"); - if (npdm == nullptr) - return ResultStatus::ErrorMissingNPDM; + if (npdm == nullptr) { + return {ResultStatus::ErrorMissingNPDM, {}}; + } - ResultStatus result2 = metadata.Load(npdm); + const ResultStatus result2 = metadata.Load(npdm); if (result2 != ResultStatus::Success) { - return result2; + return {result2, {}}; } metadata.Print(); const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()}; if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit || arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) { - return ResultStatus::Error32BitISA; + return {ResultStatus::Error32BitISA, {}}; } if (process.LoadFromMetadata(metadata).IsError()) { - return ResultStatus::ErrorUnableToParseKernelMetadata; + return {ResultStatus::ErrorUnableToParseKernelMetadata, {}}; } const FileSys::PatchManager pm(metadata.GetTitleID()); @@ -150,7 +155,7 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) const auto tentative_next_load_addr = AppLoader_NSO::LoadModule(process, *module_file, load_addr, should_pass_arguments, pm); if (!tentative_next_load_addr) { - return ResultStatus::ErrorLoadingNSO; + return {ResultStatus::ErrorLoadingNSO, {}}; } next_load_addr = *tentative_next_load_addr; @@ -159,8 +164,6 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) GDBStub::RegisterModule(module, load_addr, next_load_addr - 1, false); } - process.Run(base_address, metadata.GetMainThreadPriority(), metadata.GetMainThreadStackSize()); - // Find the RomFS by searching for a ".romfs" file in this directory const auto& files = dir->GetFiles(); const auto romfs_iter = @@ -175,7 +178,8 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) } is_loaded = true; - return ResultStatus::Success; + return {ResultStatus::Success, + LoadParameters{metadata.GetMainThreadPriority(), metadata.GetMainThreadStackSize()}}; } ResultStatus AppLoader_DeconstructedRomDirectory::ReadRomFS(FileSys::VirtualFile& dir) { diff --git a/src/core/loader/deconstructed_rom_directory.h b/src/core/loader/deconstructed_rom_directory.h index 1615cb5a8..1a65c16a4 100644 --- a/src/core/loader/deconstructed_rom_directory.h +++ b/src/core/loader/deconstructed_rom_directory.h @@ -37,7 +37,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; ResultStatus ReadIcon(std::vector<u8>& buffer) override; diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp index 46ac372f6..6d4b02375 100644 --- a/src/core/loader/elf.cpp +++ b/src/core/loader/elf.cpp @@ -382,13 +382,15 @@ FileType AppLoader_ELF::IdentifyType(const FileSys::VirtualFile& file) { return FileType::Error; } -ResultStatus AppLoader_ELF::Load(Kernel::Process& process) { - if (is_loaded) - return ResultStatus::ErrorAlreadyLoaded; +AppLoader_ELF::LoadResult AppLoader_ELF::Load(Kernel::Process& process) { + if (is_loaded) { + return {ResultStatus::ErrorAlreadyLoaded, {}}; + } std::vector<u8> buffer = file->ReadAllBytes(); - if (buffer.size() != file->GetSize()) - return ResultStatus::ErrorIncorrectELFFileSize; + if (buffer.size() != file->GetSize()) { + return {ResultStatus::ErrorIncorrectELFFileSize, {}}; + } const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); ElfReader elf_reader(&buffer[0]); @@ -396,10 +398,9 @@ ResultStatus AppLoader_ELF::Load(Kernel::Process& process) { const VAddr entry_point = codeset.entrypoint; process.LoadModule(std::move(codeset), entry_point); - process.Run(entry_point, 48, Memory::DEFAULT_STACK_SIZE); is_loaded = true; - return ResultStatus::Success; + return {ResultStatus::Success, LoadParameters{48, Memory::DEFAULT_STACK_SIZE}}; } } // namespace Loader diff --git a/src/core/loader/elf.h b/src/core/loader/elf.h index a2d33021c..7ef7770a6 100644 --- a/src/core/loader/elf.h +++ b/src/core/loader/elf.h @@ -26,7 +26,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; }; } // namespace Loader diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h index bb925f4a6..f7846db52 100644 --- a/src/core/loader/loader.h +++ b/src/core/loader/loader.h @@ -131,6 +131,12 @@ std::ostream& operator<<(std::ostream& os, ResultStatus status); /// Interface for loading an application class AppLoader : NonCopyable { public: + struct LoadParameters { + s32 main_thread_priority; + u64 main_thread_stack_size; + }; + using LoadResult = std::pair<ResultStatus, std::optional<LoadParameters>>; + explicit AppLoader(FileSys::VirtualFile file); virtual ~AppLoader(); @@ -145,7 +151,7 @@ public: * @param process The newly created process. * @return The status result of the operation. */ - virtual ResultStatus Load(Kernel::Process& process) = 0; + virtual LoadResult Load(Kernel::Process& process) = 0; /** * Loads the system mode that this application needs. diff --git a/src/core/loader/nax.cpp b/src/core/loader/nax.cpp index 93a970d10..34efef09a 100644 --- a/src/core/loader/nax.cpp +++ b/src/core/loader/nax.cpp @@ -41,31 +41,37 @@ FileType AppLoader_NAX::GetFileType() const { return IdentifyTypeImpl(*nax); } -ResultStatus AppLoader_NAX::Load(Kernel::Process& process) { +AppLoader_NAX::LoadResult AppLoader_NAX::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } - if (nax->GetStatus() != ResultStatus::Success) - return nax->GetStatus(); + const auto nax_status = nax->GetStatus(); + if (nax_status != ResultStatus::Success) { + return {nax_status, {}}; + } const auto nca = nax->AsNCA(); if (nca == nullptr) { - if (!Core::Crypto::KeyManager::KeyFileExists(false)) - return ResultStatus::ErrorMissingProductionKeyFile; - return ResultStatus::ErrorNAXInconvertibleToNCA; + if (!Core::Crypto::KeyManager::KeyFileExists(false)) { + return {ResultStatus::ErrorMissingProductionKeyFile, {}}; + } + + return {ResultStatus::ErrorNAXInconvertibleToNCA, {}}; } - if (nca->GetStatus() != ResultStatus::Success) - return nca->GetStatus(); + const auto nca_status = nca->GetStatus(); + if (nca_status != ResultStatus::Success) { + return {nca_status, {}}; + } const auto result = nca_loader->Load(process); - if (result != ResultStatus::Success) + if (result.first != ResultStatus::Success) { return result; + } is_loaded = true; - - return ResultStatus::Success; + return result; } ResultStatus AppLoader_NAX::ReadRomFS(FileSys::VirtualFile& dir) { diff --git a/src/core/loader/nax.h b/src/core/loader/nax.h index f40079574..00f1659c1 100644 --- a/src/core/loader/nax.h +++ b/src/core/loader/nax.h @@ -33,7 +33,7 @@ public: FileType GetFileType() const override; - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; u64 ReadRomFSIVFCOffset() const override; diff --git a/src/core/loader/nca.cpp b/src/core/loader/nca.cpp index ce8196fcf..b3f8f1083 100644 --- a/src/core/loader/nca.cpp +++ b/src/core/loader/nca.cpp @@ -30,36 +30,38 @@ FileType AppLoader_NCA::IdentifyType(const FileSys::VirtualFile& file) { return FileType::Error; } -ResultStatus AppLoader_NCA::Load(Kernel::Process& process) { +AppLoader_NCA::LoadResult AppLoader_NCA::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } const auto result = nca->GetStatus(); if (result != ResultStatus::Success) { - return result; + return {result, {}}; } - if (nca->GetType() != FileSys::NCAContentType::Program) - return ResultStatus::ErrorNCANotProgram; + if (nca->GetType() != FileSys::NCAContentType::Program) { + return {ResultStatus::ErrorNCANotProgram, {}}; + } const auto exefs = nca->GetExeFS(); - - if (exefs == nullptr) - return ResultStatus::ErrorNoExeFS; + if (exefs == nullptr) { + return {ResultStatus::ErrorNoExeFS, {}}; + } directory_loader = std::make_unique<AppLoader_DeconstructedRomDirectory>(exefs, true); const auto load_result = directory_loader->Load(process); - if (load_result != ResultStatus::Success) + if (load_result.first != ResultStatus::Success) { return load_result; + } - if (nca->GetRomFS() != nullptr && nca->GetRomFS()->GetSize() > 0) + if (nca->GetRomFS() != nullptr && nca->GetRomFS()->GetSize() > 0) { Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this)); + } is_loaded = true; - - return ResultStatus::Success; + return load_result; } ResultStatus AppLoader_NCA::ReadRomFS(FileSys::VirtualFile& dir) { diff --git a/src/core/loader/nca.h b/src/core/loader/nca.h index b9f077468..94f0ed677 100644 --- a/src/core/loader/nca.h +++ b/src/core/loader/nca.h @@ -33,7 +33,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; u64 ReadRomFSIVFCOffset() const override; diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp index 31e4a0c84..6a0ca389b 100644 --- a/src/core/loader/nro.cpp +++ b/src/core/loader/nro.cpp @@ -201,25 +201,25 @@ bool AppLoader_NRO::LoadNro(Kernel::Process& process, const FileSys::VfsFile& fi return LoadNroImpl(process, file.ReadAllBytes(), file.GetName(), load_base); } -ResultStatus AppLoader_NRO::Load(Kernel::Process& process) { +AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } // Load NRO const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); if (!LoadNro(process, *file, base_address)) { - return ResultStatus::ErrorLoadingNRO; + return {ResultStatus::ErrorLoadingNRO, {}}; } - if (romfs != nullptr) + if (romfs != nullptr) { Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this)); - - process.Run(base_address, Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE); + } is_loaded = true; - return ResultStatus::Success; + return {ResultStatus::Success, + LoadParameters{Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE}}; } ResultStatus AppLoader_NRO::ReadIcon(std::vector<u8>& buffer) { diff --git a/src/core/loader/nro.h b/src/core/loader/nro.h index 85b0ed644..1ffdae805 100644 --- a/src/core/loader/nro.h +++ b/src/core/loader/nro.h @@ -37,7 +37,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadIcon(std::vector<u8>& buffer) override; ResultStatus ReadProgramId(u64& out_program_id) override; diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp index d7c47c197..a86653204 100644 --- a/src/core/loader/nso.cpp +++ b/src/core/loader/nso.cpp @@ -169,22 +169,21 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, return load_base + image_size; } -ResultStatus AppLoader_NSO::Load(Kernel::Process& process) { +AppLoader_NSO::LoadResult AppLoader_NSO::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } // Load module const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); if (!LoadModule(process, *file, base_address, true)) { - return ResultStatus::ErrorLoadingNSO; + return {ResultStatus::ErrorLoadingNSO, {}}; } LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", file->GetName(), base_address); - process.Run(base_address, Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE); - is_loaded = true; - return ResultStatus::Success; + return {ResultStatus::Success, + LoadParameters{Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE}}; } } // namespace Loader diff --git a/src/core/loader/nso.h b/src/core/loader/nso.h index 4674c3724..fdce9191c 100644 --- a/src/core/loader/nso.h +++ b/src/core/loader/nso.h @@ -84,7 +84,7 @@ public: VAddr load_base, bool should_pass_arguments, std::optional<FileSys::PatchManager> pm = {}); - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; }; } // namespace Loader diff --git a/src/core/loader/nsp.cpp b/src/core/loader/nsp.cpp index 7da1f8960..ad56bbb38 100644 --- a/src/core/loader/nsp.cpp +++ b/src/core/loader/nsp.cpp @@ -72,37 +72,45 @@ FileType AppLoader_NSP::IdentifyType(const FileSys::VirtualFile& file) { return FileType::Error; } -ResultStatus AppLoader_NSP::Load(Kernel::Process& process) { +AppLoader_NSP::LoadResult AppLoader_NSP::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } - if (title_id == 0) - return ResultStatus::ErrorNSPMissingProgramNCA; + if (title_id == 0) { + return {ResultStatus::ErrorNSPMissingProgramNCA, {}}; + } - if (nsp->GetStatus() != ResultStatus::Success) - return nsp->GetStatus(); + const auto nsp_status = nsp->GetStatus(); + if (nsp_status != ResultStatus::Success) { + return {nsp_status, {}}; + } - if (nsp->GetProgramStatus(title_id) != ResultStatus::Success) - return nsp->GetProgramStatus(title_id); + const auto nsp_program_status = nsp->GetProgramStatus(title_id); + if (nsp_program_status != ResultStatus::Success) { + return {nsp_program_status, {}}; + } if (nsp->GetNCA(title_id, FileSys::ContentRecordType::Program) == nullptr) { - if (!Core::Crypto::KeyManager::KeyFileExists(false)) - return ResultStatus::ErrorMissingProductionKeyFile; - return ResultStatus::ErrorNSPMissingProgramNCA; + if (!Core::Crypto::KeyManager::KeyFileExists(false)) { + return {ResultStatus::ErrorMissingProductionKeyFile, {}}; + } + + return {ResultStatus::ErrorNSPMissingProgramNCA, {}}; } const auto result = secondary_loader->Load(process); - if (result != ResultStatus::Success) + if (result.first != ResultStatus::Success) { return result; + } FileSys::VirtualFile update_raw; - if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) + if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) { Service::FileSystem::SetPackedUpdate(std::move(update_raw)); + } is_loaded = true; - - return ResultStatus::Success; + return result; } ResultStatus AppLoader_NSP::ReadRomFS(FileSys::VirtualFile& file) { diff --git a/src/core/loader/nsp.h b/src/core/loader/nsp.h index 953a1b508..85e870bdf 100644 --- a/src/core/loader/nsp.h +++ b/src/core/loader/nsp.h @@ -35,7 +35,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadRomFS(FileSys::VirtualFile& file) override; u64 ReadRomFSIVFCOffset() const override; diff --git a/src/core/loader/xci.cpp b/src/core/loader/xci.cpp index 89f7bbf77..1e285a053 100644 --- a/src/core/loader/xci.cpp +++ b/src/core/loader/xci.cpp @@ -48,31 +48,35 @@ FileType AppLoader_XCI::IdentifyType(const FileSys::VirtualFile& file) { return FileType::Error; } -ResultStatus AppLoader_XCI::Load(Kernel::Process& process) { +AppLoader_XCI::LoadResult AppLoader_XCI::Load(Kernel::Process& process) { if (is_loaded) { - return ResultStatus::ErrorAlreadyLoaded; + return {ResultStatus::ErrorAlreadyLoaded, {}}; } - if (xci->GetStatus() != ResultStatus::Success) - return xci->GetStatus(); + if (xci->GetStatus() != ResultStatus::Success) { + return {xci->GetStatus(), {}}; + } - if (xci->GetProgramNCAStatus() != ResultStatus::Success) - return xci->GetProgramNCAStatus(); + if (xci->GetProgramNCAStatus() != ResultStatus::Success) { + return {xci->GetProgramNCAStatus(), {}}; + } - if (!xci->HasProgramNCA() && !Core::Crypto::KeyManager::KeyFileExists(false)) - return ResultStatus::ErrorMissingProductionKeyFile; + if (!xci->HasProgramNCA() && !Core::Crypto::KeyManager::KeyFileExists(false)) { + return {ResultStatus::ErrorMissingProductionKeyFile, {}}; + } const auto result = nca_loader->Load(process); - if (result != ResultStatus::Success) + if (result.first != ResultStatus::Success) { return result; + } FileSys::VirtualFile update_raw; - if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) + if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) { Service::FileSystem::SetPackedUpdate(std::move(update_raw)); + } is_loaded = true; - - return ResultStatus::Success; + return result; } ResultStatus AppLoader_XCI::ReadRomFS(FileSys::VirtualFile& file) { diff --git a/src/core/loader/xci.h b/src/core/loader/xci.h index 436f7387c..ae7145b14 100644 --- a/src/core/loader/xci.h +++ b/src/core/loader/xci.h @@ -35,7 +35,7 @@ public: return IdentifyType(file); } - ResultStatus Load(Kernel::Process& process) override; + LoadResult Load(Kernel::Process& process) override; ResultStatus ReadRomFS(FileSys::VirtualFile& file) override; u64 ReadRomFSIVFCOffset() const override; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 4e0538bc2..f18f6226b 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -26,16 +26,16 @@ namespace Memory { static Common::PageTable* current_page_table = nullptr; -void SetCurrentPageTable(Common::PageTable* page_table) { - current_page_table = page_table; +void SetCurrentPageTable(Kernel::Process& process) { + current_page_table = &process.VMManager().page_table; + + const std::size_t address_space_width = process.VMManager().GetAddressSpaceWidth(); auto& system = Core::System::GetInstance(); - if (system.IsPoweredOn()) { - system.ArmInterface(0).PageTableChanged(); - system.ArmInterface(1).PageTableChanged(); - system.ArmInterface(2).PageTableChanged(); - system.ArmInterface(3).PageTableChanged(); - } + system.ArmInterface(0).PageTableChanged(*current_page_table, address_space_width); + system.ArmInterface(1).PageTableChanged(*current_page_table, address_space_width); + system.ArmInterface(2).PageTableChanged(*current_page_table, address_space_width); + system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width); } static void MapPages(Common::PageTable& page_table, VAddr base, u64 size, u8* memory, diff --git a/src/core/memory.h b/src/core/memory.h index 6845f5fe1..b9fa18b1d 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -40,8 +40,9 @@ enum : VAddr { KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE, }; -/// Changes the currently active page table. -void SetCurrentPageTable(Common::PageTable* page_table); +/// Changes the currently active page table to that of +/// the given process instance. +void SetCurrentPageTable(Kernel::Process& process); /// Determines if the given VAddr is valid for the specified process. bool IsValidVirtualAddress(const Kernel::Process& process, VAddr vaddr); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 1e31a2900..6821f275d 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -36,6 +36,8 @@ add_library(video_core STATIC renderer_base.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h + renderer_opengl/gl_device.cpp + renderer_opengl/gl_device.h renderer_opengl/gl_global_cache.cpp renderer_opengl/gl_global_cache.h renderer_opengl/gl_primitive_assembler.cpp diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 046d047cb..6674d9405 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -57,8 +57,8 @@ bool DmaPusher::Step() { // Push buffer non-empty, read a word command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); for (const CommandHeader& command_header : command_headers) { diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index cd51a31d7..7387886a3 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -10,6 +10,7 @@ #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { @@ -27,30 +28,46 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { - state.write_offset = 0; + ProcessExec(); break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(method_call.argument); + ProcessData(method_call.argument, method_call.IsLastCall()); break; } } } -void KeplerMemory::ProcessData(u32 data) { - ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); - ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); - - // We have to invalidate the destination region to evict any outdated surfaces from the cache. - // We do this before actually writing the new data because the destination address might - // contain a dirty surface that will have to be written back to memory. - const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)}; - rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32)); - memory_manager.Write<u32>(address, data); - - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); +void KeplerMemory::ProcessExec() { + state.write_offset = 0; + state.copy_size = regs.line_length_in * regs.line_count; + state.inner_buffer.resize(state.copy_size); +} - state.write_offset++; +void KeplerMemory::ProcessData(u32 data, bool is_last_call) { + const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset); + std::memcpy(&state.inner_buffer[state.write_offset], ®s.data, sub_copy_size); + state.write_offset += sub_copy_size; + if (is_last_call) { + const GPUVAddr address{regs.dest.Address()}; + if (regs.exec.linear != 0) { + memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size); + } else { + UNIMPLEMENTED_IF(regs.dest.z != 0); + UNIMPLEMENTED_IF(regs.dest.depth != 1); + UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1); + UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1); + const std::size_t dst_size = Tegra::Texture::CalculateSize( + true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1); + std::vector<u8> tmp_buffer(dst_size); + memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); + Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, + regs.dest.y, regs.dest.BlockHeight(), state.copy_size, + state.inner_buffer.data(), tmp_buffer.data()); + memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); + } + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } } } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 78b6c3e45..5f892ddad 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -6,6 +6,7 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" @@ -51,7 +52,11 @@ public: u32 address_high; u32 address_low; u32 pitch; - u32 block_dimensions; + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; u32 width; u32 height; u32 depth; @@ -63,6 +68,18 @@ public: return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } + + u32 BlockWidth() const { + return 1U << block_width.Value(); + } + + u32 BlockHeight() const { + return 1U << block_height.Value(); + } + + u32 BlockDepth() const { + return 1U << block_depth.Value(); + } } dest; struct { @@ -81,6 +98,8 @@ public: struct { u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; } state{}; private: @@ -88,7 +107,8 @@ private: VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; - void ProcessData(u32 data); + void ProcessExec(); + void ProcessData(u32 data, bool is_last_call); }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index b198793bc..9780417f2 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -418,7 +418,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; Texture::TICEntry tic_entry; - memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); + memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear || tic_entry.header_version == Texture::TICHeaderVersion::Pitch, @@ -439,7 +439,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; Texture::TSCEntry tsc_entry; - memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); + memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; } diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index fce9733b9..e5b4eadea 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -937,21 +937,34 @@ union Instruction { } iset; union { - BitField<8, 2, Register::Size> dest_size; - BitField<10, 2, Register::Size> src_size; - BitField<12, 1, u64> is_output_signed; - BitField<13, 1, u64> is_input_signed; - BitField<41, 2, u64> selector; + BitField<41, 2, u64> selector; // i2i and i2f only BitField<45, 1, u64> negate_a; BitField<49, 1, u64> abs_a; + BitField<10, 2, Register::Size> src_size; + BitField<13, 1, u64> is_input_signed; + BitField<8, 2, Register::Size> dst_size; + BitField<12, 1, u64> is_output_signed; + + union { + BitField<39, 2, u64> tab5cb8_2; + } i2f; union { BitField<39, 2, F2iRoundingOp> rounding; } f2i; union { - BitField<39, 4, F2fRoundingOp> rounding; + BitField<8, 2, Register::Size> src_size; + BitField<10, 2, Register::Size> dst_size; + BitField<39, 4, u64> rounding; + // H0, H1 extract for F16 missing + BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value + F2fRoundingOp GetRoundingMode() const { + constexpr u64 rounding_mask = 0x0B; + return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); + } } f2f; + } conversion; union { @@ -1734,7 +1747,7 @@ private: INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"), INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"), INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"), - INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), + INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"), INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"), INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"), diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index de30ea354..fe6628923 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -207,6 +207,11 @@ public: }; } regs{}; + /// Performs any additional setup necessary in order to begin GPU emulation. + /// This can be used to launch any necessary threads and register any necessary + /// core timing events. + virtual void Start() = 0; + /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index db507cf04..d4e2553a9 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -9,10 +9,14 @@ namespace VideoCommon { GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) - : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {} + : GPU(system, renderer), gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; +void GPUAsynch::Start() { + gpu_thread.StartThread(renderer, *dma_pusher); +} + void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 1dcc61a6c..30be74cba 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -13,16 +13,13 @@ class RendererBase; namespace VideoCommon { -namespace GPUThread { -class ThreadManager; -} // namespace GPUThread - /// Implementation of GPU interface that runs the GPU asynchronously class GPUAsynch : public Tegra::GPU { public: explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer); ~GPUAsynch() override; + void Start() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 2cfc900ed..45e43b1dc 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -8,10 +8,12 @@ namespace VideoCommon { GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer) - : Tegra::GPU(system, renderer) {} + : GPU(system, renderer) {} GPUSynch::~GPUSynch() = default; +void GPUSynch::Start() {} + void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->Push(std::move(entries)); dma_pusher->DispatchCalls(); diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 766b5631c..3031fcf72 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -18,6 +18,7 @@ public: explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer); ~GPUSynch() override; + void Start() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index cc56cf467..c9a2077de 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -55,19 +55,24 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p } } -ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, - Tegra::DmaPusher& dma_pusher) - : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} { - synchronization_event = system.CoreTiming().RegisterEvent( - "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); }); -} +ThreadManager::ThreadManager(Core::System& system) : system{system} {} ThreadManager::~ThreadManager() { + if (!thread.joinable()) { + return; + } + // Notify GPU thread that a shutdown is pending PushCommand(EndProcessingCommand()); thread.join(); } +void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) { + thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)}; + synchronization_event = system.CoreTiming().RegisterEvent( + "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); }); +} + void ThreadManager::SubmitList(Tegra::CommandList&& entries) { const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))}; const s64 synchronization_ticks{Core::Timing::usToCycles(9000)}; diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 62bcea5bb..cc14527c7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -138,10 +138,12 @@ struct SynchState final { /// Class used to manage the GPU thread class ThreadManager final { public: - explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, - Tegra::DmaPusher& dma_pusher); + explicit ThreadManager(Core::System& system); ~ThreadManager(); + /// Creates and starts the GPU thread. + void StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher); + /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 0f4e820aa..6c98c6701 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -199,7 +199,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const { return {}; } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const { +bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) { + const GPUVAddr end = start + size; + const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start)); + const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end)); + const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start); + return range == size; +} + +void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -226,7 +234,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { +void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, + const std::size_t size) const { + std::size_t remaining_size{size}; + std::size_t page_index{src_addr >> page_bits}; + std::size_t page_offset{src_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + const u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + const u8* src_ptr{page_pointer + page_offset}; + std::memcpy(dest_buffer, src_ptr, copy_amount); + } else { + std::memset(dest_buffer, 0, copy_amount); + } + page_index++; + page_offset = 0; + dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{dest_addr >> page_bits}; std::size_t page_offset{dest_addr & page_mask}; @@ -253,7 +284,28 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std:: } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { +void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, + const std::size_t size) { + std::size_t remaining_size{size}; + std::size_t page_index{dest_addr >> page_bits}; + std::size_t page_offset{dest_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + u8* dest_ptr{page_pointer + page_offset}; + std::memcpy(dest_ptr, src_buffer, copy_amount); + } + page_index++; + page_offset = 0; + src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -281,6 +333,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t } } +void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { + std::vector<u8> tmp_buffer(size); + ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); +} + void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, VAddr backing_addr) { LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 647cbf93a..e4f0c4bd6 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -65,9 +65,32 @@ public: u8* GetPointer(GPUVAddr addr); const u8* GetPointer(GPUVAddr addr) const; - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + // Returns true if the block is continous in host memory, false otherwise + bool IsBlockContinous(const GPUVAddr start, const std::size_t size); + + /** + * ReadBlock and WriteBlock are full read and write operations over virtual + * GPU Memory. It's important to use these when GPU memory may not be continous + * in the Host Memory counterpart. Note: This functions cause Host GPU Memory + * Flushes and Invalidations, respectively to each operation. + */ + void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const; + void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size); + void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size); + + /** + * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and + * WriteBlock respectively. In this versions, no flushing or invalidation is actually + * done and their performance is similar to a memcpy. This functions can be used + * on either of this 2 scenarios instead of their safe counterpart: + * - Memory which is sure to never be represented in the Host GPU. + * - Memory Managed by a Cache Manager. Example: Texture Flushing should use + * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture + * being flushed. + */ + void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size); + void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size); private: using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp new file mode 100644 index 000000000..b6d9e0ddb --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -0,0 +1,45 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> +#include <glad/glad.h> + +#include "common/logging/log.h" +#include "video_core/renderer_opengl/gl_device.h" + +namespace OpenGL { + +namespace { +template <typename T> +T GetInteger(GLenum pname) { + GLint temporary; + glGetIntegerv(pname, &temporary); + return static_cast<T>(temporary); +} +} // Anonymous namespace + +Device::Device() { + uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + has_variable_aoffi = TestVariableAoffi(); +} + +bool Device::TestVariableAoffi() { + const GLchar* AOFFI_TEST = R"(#version 430 core +uniform sampler2D tex; +uniform ivec2 variable_offset; +void main() { + gl_Position = textureOffset(tex, vec2(0), variable_offset); +} +)"; + const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)}; + GLint link_status{}; + glGetProgramiv(shader, GL_LINK_STATUS, &link_status); + glDeleteProgram(shader); + + const bool supported{link_status == GL_TRUE}; + LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported); + return supported; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h new file mode 100644 index 000000000..78ff5ee58 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.h @@ -0,0 +1,30 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> + +namespace OpenGL { + +class Device { +public: + Device(); + + std::size_t GetUniformBufferAlignment() const { + return uniform_buffer_alignment; + } + + bool HasVariableAoffi() const { + return has_variable_aoffi; + } + +private: + static bool TestVariableAoffi(); + + std::size_t uniform_buffer_alignment{}; + bool has_variable_aoffi{}; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6034dc489..9a088a503 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -99,7 +99,7 @@ struct FramebufferCacheKey { }; RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) - : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system}, + : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system}, screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) { OpenGLState::ApplyDefaultState(); @@ -107,8 +107,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) state.draw.shader_program = 0; state.Apply(); - glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); - LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); } @@ -315,8 +313,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = buffer_cache.UploadHostMemory( - &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr offset = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, @@ -700,23 +698,24 @@ void RasterizerOpenGL::DrawArrays() { // Add space for index buffer (keeping in mind non-core primitives) switch (regs.draw.topology) { case Maxwell::PrimitiveTopology::Quads: - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + buffer_size = Common::AlignUp(buffer_size, 4) + primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count); break; default: if (is_indexed) { - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize(); + buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); } break; } // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage; + buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * + Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); + buffer_size += + Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); const bool invalidate = buffer_cache.Map(buffer_size); if (invalidate) { @@ -848,8 +847,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader size = Common::AlignUp(size, sizeof(GLvec4)); ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); - const GLintptr const_buffer_offset = buffer_cache.UploadMemory( - buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr const_buffer_offset = + buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a0e056142..71b9c5ead 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -21,6 +21,7 @@ #include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_primitive_assembler.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" @@ -172,6 +173,7 @@ private: /// but are needed for correct emulation void CheckExtensions(); + const Device device; OpenGLState state; RasterizerCacheOpenGL res_cache; @@ -180,7 +182,6 @@ private: SamplerCacheOpenGL sampler_cache; Core::System& system; - ScreenInfo& screen_info; std::unique_ptr<GLShader::ProgramManager> shader_program_manager; @@ -196,7 +197,6 @@ private: static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; PrimitiveAssembler primitive_assembler{buffer_cache}; - GLint uniform_buffer_alignment; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 7a68b8738..5a25f5b37 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -640,13 +640,16 @@ void CachedSurface::LoadGLBuffer() { SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); } else { const u32 bpp = params.GetFormatBpp() / 8; - const u32 copy_size = params.width * bpp; + const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) / + GetDefaultBlockWidth(params.pixel_format); if (params.pitch == copy_size) { std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl); } else { + const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) / + GetDefaultBlockHeight(params.pixel_format); const u8* start{params.host_ptr}; u8* write_to = gl_buffer[0].data(); - for (u32 h = params.height; h > 0; h--) { + for (u32 h = height; h > 0; h--) { std::memcpy(write_to, start, copy_size); start += params.pitch; write_to += copy_size; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 99f67494c..2a81b1169 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { } /// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(const u8* host_ptr) { +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, + const u8* host_ptr) { ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); ASSERT_OR_EXECUTE(host_ptr != nullptr, { std::fill(program_code.begin(), program_code.end(), 0); return program_code; }); - std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64)); + memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(), + program_code.size() * sizeof(u64)); return program_code; } @@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code, - ProgramCode program_code_b) { +GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); if (program_type == Maxwell::ShaderProgram::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. @@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr switch (program_type) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: - return GLShader::GenerateVertexShader(setup); + return GLShader::GenerateVertexShader(device, setup); case Maxwell::ShaderProgram::Geometry: - return GLShader::GenerateGeometryShader(setup); + return GLShader::GenerateGeometryShader(device, setup); case Maxwell::ShaderProgram::Fragment: - return GLShader::GenerateFragmentShader(setup); + return GLShader::GenerateFragmentShader(device, setup); default: LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); UNREACHABLE(); @@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() { return supported_formats; } -} // namespace +} // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, +CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { - - const std::size_t code_size = CalculateProgramSize(program_code); - const std::size_t code_size_b = - program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b); - - GLShader::ProgramResult program_result = - CreateProgram(program_type, program_code, program_code_b); + const std::size_t code_size{CalculateProgramSize(program_code)}; + const std::size_t code_size_b{program_code_b.empty() ? 0 + : CalculateProgramSize(program_code_b)}; + GLShader::ProgramResult program_result{ + CreateProgram(device, program_type, program_code, program_code_b)}; if (program_result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now return; @@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ precompiled_programs} { - code = std::move(result.first); entries = result.second; shader_length = entries.shader_length; @@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode, return {unique_identifier, base_bindings, primitive_mode}; } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system) - : RasterizerCache{rasterizer}, disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device) + : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {} void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -439,17 +439,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) { std::unordered_map<u64, UnspecializedShader> unspecialized; - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); + } for (std::size_t i = 0; i < raws.size(); ++i) { - if (stop_loading) + if (stop_loading) { return {}; - + } const auto& raw{raws[i]}; - const u64 unique_identifier = raw.GetUniqueIdentifier(); - const u64 calculated_hash = - GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + const u64 unique_identifier{raw.GetUniqueIdentifier()}; + const u64 calculated_hash{ + GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())}; if (unique_identifier != calculated_hash) { LOG_ERROR( Render_OpenGL, @@ -466,8 +467,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia result = {stored_decompiled.code, stored_decompiled.entries}; } else { // Otherwise decompile the shader at boot and save the result to the decompiled file - result = - CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(), + raw.GetProgramCodeB()); disk_cache.SaveDecompiled(unique_identifier, result.first, result.second); } @@ -477,8 +478,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia {raw.GetUniqueIdentifier(), {std::move(result.first), std::move(result.second), raw.GetProgramType()}}); - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); + } } return unspecialized; } @@ -497,11 +499,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!shader) { // No shader found - create a new one - ProgramCode program_code{GetShaderCode(host_ptr)}; + ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; if (program == Maxwell::ShaderProgram::VertexA) { - program_code_b = GetShaderCode( - memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB))); + const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)}; + program_code_b = GetShaderCode(memory_manager, program_addr_b, + memory_manager.GetPointer(program_addr_b)); } const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; @@ -512,7 +515,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { precompiled_programs, found->second, host_ptr); } else { shader = std::make_shared<CachedShader>( - cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, + device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, std::move(program_code), std::move(program_code_b), host_ptr); } Register(shader); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 0cf8e0b3d..a332087f8 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -27,6 +27,7 @@ class System; namespace OpenGL { class CachedShader; +class Device; class RasterizerOpenGL; struct UnspecializedShader; @@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>; class CachedShader final : public RasterizerCacheObject { public: - explicit CachedShader(VAddr cpu_addr, u64 unique_identifier, + explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); @@ -112,7 +113,8 @@ private: class ShaderCacheOpenGL final : public RasterizerCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device); /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, @@ -130,6 +132,8 @@ private: CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, const std::set<GLenum>& supported_formats); + const Device& device; + std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; ShaderDiskCacheOpenGL disk_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 445048daf..ef1a1995f 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -15,6 +15,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" @@ -119,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) { return arithmetic->precise; } - if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) { - return half_arithmetic->precise; - } return false; } @@ -139,8 +136,9 @@ bool IsPrecise(Node node) { class GLSLDecompiler final { public: - explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix) - : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + std::string suffix) + : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} void Decompile() { DeclareVertex(); @@ -627,28 +625,7 @@ private: } std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { - std::string value = VisitOperand(operation, operand_index); - switch (type) { - case Type::HalfFloat: { - const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta()); - if (!half_meta) { - value = "toHalf2(" + value + ')'; - } - - switch (half_meta->types.at(operand_index)) { - case Tegra::Shader::HalfType::H0_H1: - return "toHalf2(" + value + ')'; - case Tegra::Shader::HalfType::F32: - return "vec2(" + value + ')'; - case Tegra::Shader::HalfType::H0_H0: - return "vec2(toHalf2(" + value + ")[0])"; - case Tegra::Shader::HalfType::H1_H1: - return "vec2(toHalf2(" + value + ")[1])"; - } - } - default: - return CastOperand(value, type); - } + return CastOperand(VisitOperand(operation, operand_index), type); } std::string CastOperand(const std::string& value, Type type) const { @@ -662,9 +639,7 @@ private: case Type::Uint: return "ftou(" + value + ')'; case Type::HalfFloat: - // Can't be handled as a stand-alone value - UNREACHABLE(); - return value; + return "toHalf2(" + value + ')'; } UNREACHABLE(); return value; @@ -829,8 +804,12 @@ private: // Inline the string as an immediate integer in GLSL (AOFFI arguments are required // to be constant by the standard). expr += std::to_string(static_cast<s32>(immediate->GetValue())); - } else { + } else if (device.HasVariableAoffi()) { + // Avoid using variable AOFFI on unsupported devices. expr += "ftoi(" + Visit(operand) + ')'; + } else { + // Insert 0 on devices not supporting variable AOFFI. + expr += '0'; } if (index + 1 < aoffi.size()) { expr += ", "; @@ -1083,13 +1062,40 @@ private: return BitwiseCastResult(value, Type::HalfFloat); } + std::string HClamp(Operation operation) { + const std::string value = VisitOperand(operation, 0, Type::HalfFloat); + const std::string min = VisitOperand(operation, 1, Type::Float); + const std::string max = VisitOperand(operation, 2, Type::Float); + const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))"; + return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat)); + } + + std::string HUnpack(Operation operation) { + const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)}; + const auto value = [&]() -> std::string { + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: + return "vec2(fromHalf2(" + operand + "))"; + case Tegra::Shader::HalfType::H0_H0: + return "vec2(" + operand + "[0])"; + case Tegra::Shader::HalfType::H1_H1: + return "vec2(" + operand + "[1])"; + } + UNREACHABLE(); + return "0"; + }(); + return "fromHalf2(" + value + ')'; + } + std::string HMergeF32(Operation operation) { return "float(toHalf2(" + Visit(operation[0]) + ")[0])"; } std::string HMergeH0(Operation operation) { - return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" + - Visit(operation[1]) + ")[0]))"; + return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" + + Visit(operation[0]) + ")[1]))"; } std::string HMergeH1(Operation operation) { @@ -1189,34 +1195,46 @@ private: return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); } + template <bool with_nan> + std::string GenerateHalfComparison(Operation operation, std::string compare_op) { + std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, + Type::HalfFloat, Type::HalfFloat)}; + if constexpr (!with_nan) { + return comparison; + } + return "halfFloatNanComparison(" + comparison + ", " + + VisitOperand(operation, 0, Type::HalfFloat) + ", " + + VisitOperand(operation, 1, Type::HalfFloat) + ')'; + } + + template <bool with_nan> std::string Logical2HLessThan(Operation operation) { - return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThan"); } + template <bool with_nan> std::string Logical2HEqual(Operation operation) { - return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "equal"); } + template <bool with_nan> std::string Logical2HLessEqual(Operation operation) { - return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThanEqual"); } + template <bool with_nan> std::string Logical2HGreaterThan(Operation operation) { - return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThan"); } + template <bool with_nan> std::string Logical2HNotEqual(Operation operation) { - return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "notEqual"); } + template <bool with_nan> std::string Logical2HGreaterEqual(Operation operation) { - return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual"); } std::string Texture(Operation operation) { @@ -1505,6 +1523,8 @@ private: &GLSLDecompiler::Fma<Type::HalfFloat>, &GLSLDecompiler::Absolute<Type::HalfFloat>, &GLSLDecompiler::HNegate, + &GLSLDecompiler::HClamp, + &GLSLDecompiler::HUnpack, &GLSLDecompiler::HMergeF32, &GLSLDecompiler::HMergeH0, &GLSLDecompiler::HMergeH1, @@ -1541,12 +1561,18 @@ private: &GLSLDecompiler::LogicalNotEqual<Type::Uint>, &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, - &GLSLDecompiler::Logical2HLessThan, - &GLSLDecompiler::Logical2HEqual, - &GLSLDecompiler::Logical2HLessEqual, - &GLSLDecompiler::Logical2HGreaterThan, - &GLSLDecompiler::Logical2HNotEqual, - &GLSLDecompiler::Logical2HGreaterEqual, + &GLSLDecompiler::Logical2HLessThan<false>, + &GLSLDecompiler::Logical2HEqual<false>, + &GLSLDecompiler::Logical2HLessEqual<false>, + &GLSLDecompiler::Logical2HGreaterThan<false>, + &GLSLDecompiler::Logical2HNotEqual<false>, + &GLSLDecompiler::Logical2HGreaterEqual<false>, + &GLSLDecompiler::Logical2HLessThan<true>, + &GLSLDecompiler::Logical2HEqual<true>, + &GLSLDecompiler::Logical2HLessEqual<true>, + &GLSLDecompiler::Logical2HGreaterThan<true>, + &GLSLDecompiler::Logical2HNotEqual<true>, + &GLSLDecompiler::Logical2HGreaterEqual<true>, &GLSLDecompiler::Texture, &GLSLDecompiler::TextureLod, @@ -1625,6 +1651,7 @@ private: return name + '_' + std::to_string(index) + '_' + suffix; } + const Device& device; const ShaderIR& ir; const ShaderStage stage; const std::string suffix; @@ -1647,11 +1674,18 @@ std::string GetCommonDeclarations() { "}\n\n" "vec2 toHalf2(float value) {\n" " return unpackHalf2x16(ftou(value));\n" + "}\n\n" + "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n" + " bvec2 is_nan1 = isnan(pair1);\n" + " bvec2 is_nan2 = isnan(pair2);\n" + " return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || " + "is_nan2.y);\n" "}\n"; } -ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) { - GLSLDecompiler decompiler(ir, stage, suffix); +ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, + const std::string& suffix) { + GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); return {decompiler.GetResult(), decompiler.GetShaderEntries()}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 74032d237..c1569e737 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,6 +12,10 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace VideoCommon::Shader { class ShaderIR; } @@ -77,7 +81,7 @@ struct ShaderEntries { std::string GetCommonDeclarations(); -ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage, - const std::string& suffix); +ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Maxwell::ShaderStage stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 8763d9c71..6abf948f8 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR; static constexpr u32 PROGRAM_OFFSET{10}; -ProgramResult GenerateVertexShader(const ShaderSetup& setup) { +ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -34,14 +34,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + ProgramResult program = + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); out += program.first; if (setup.IsDualProgram()) { ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); ProgramResult program_b = - Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); + Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); out += program_b.first; } @@ -57,6 +58,9 @@ void main() { } out += R"( + + // Set Position Y direction + position.y *= utof(config_pack[2]); // Check if the flip stage is VertexB // Config pack's second value is flip_stage if (config_pack[1] == 1) { @@ -75,7 +79,7 @@ void main() { return {out, program.second}; } -ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -95,7 +99,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); out += program.first; out += R"( @@ -106,7 +110,7 @@ void main() { return {out, program.second}; } -ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -158,7 +162,7 @@ bool AlphaFunc(in float value) { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); out += program.first; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index fad346b48..0536c8a03 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -10,6 +10,10 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace OpenGL::GLShader { using VideoCommon::Shader::ProgramCode; @@ -39,22 +43,13 @@ private: bool has_program_b{}; }; -/** - * Generates the GLSL vertex shader program source code for the given VS program - * @returns String of the shader source code - */ -ProgramResult GenerateVertexShader(const ShaderSetup& setup); - -/** - * Generates the GLSL geometry shader program source code for the given GS program - * @returns String of the shader source code - */ -ProgramResult GenerateGeometryShader(const ShaderSetup& setup); - -/** - * Generates the GLSL fragment shader program source code for the given FS program - * @returns String of the shader source code - */ -ProgramResult GenerateFragmentShader(const ShaderSetup& setup); +/// Generates the GLSL vertex shader program source code for the given VS program +ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL geometry shader program source code for the given GS program +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL fragment shader program source code for the given FS program +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 25500f9a3..23d9b10db 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (std::holds_alternative<MetaArithmetic>(meta)) { return std::get<MetaArithmetic>(meta).precise; } - if (std::holds_alternative<MetaHalfArithmetic>(meta)) { - return std::get<MetaHalfArithmetic>(meta).precise; - } return false; } @@ -746,6 +742,16 @@ private: return {}; } + Id HClamp(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + + Id HUnpack(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + Id HMergeF32(Operation operation) { UNIMPLEMENTED(); return {}; @@ -1218,6 +1224,8 @@ private: &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>, &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>, &SPIRVDecompiler::HNegate, + &SPIRVDecompiler::HClamp, + &SPIRVDecompiler::HUnpack, &SPIRVDecompiler::HMergeF32, &SPIRVDecompiler::HMergeH0, &SPIRVDecompiler::HMergeH1, @@ -1260,6 +1268,13 @@ private: &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, + // TODO(Rodrigo): Should these use the OpFUnord* variants? + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Texture, &SPIRVDecompiler::TextureLod, diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index baee89107..9467f9417 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -18,7 +18,9 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { if (opcode->get().GetId() == OpCode::Id::HADD2_C || opcode->get().GetId() == OpCode::Id::HADD2_R) { - UNIMPLEMENTED_IF(instr.alu_half.ftz != 0); + if (instr.alu_half.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented"); @@ -27,9 +29,8 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { const bool negate_b = opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; - const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a); - - // instr.alu_half.type_a + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a); Node op_b = [&]() { switch (opcode->get().GetId()) { @@ -44,17 +45,17 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); + op_b = UnpackHalfFloat(op_b, instr.alu_half.type_b); op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_C: case OpCode::Id::HMUL2_R: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); return Immediate(0); diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index c2164ba50..fbcd35b18 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { - UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0); + if (instr.alu_half_imm.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } else { UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); } - UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0, - "Half float immediate saturation not implemented"); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a); const Node op_b = UnpackHalfImmediate(instr, true); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_IMM: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_IMM: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNREACHABLE(); return Immediate(0); } }(); - value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); + value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate); + value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); SetRegister(bb, instr.gpr0, value); - return pc; } diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 55a6fbbf2..ba15b1115 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); switch (opcode->get().GetId()) { - case OpCode::Id::I2I_R: { + case OpCode::Id::I2I_R: + case OpCode::Id::I2I_C: + case OpCode::Id::I2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.selector); + UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.alu.saturate_d); const bool input_signed = instr.conversion.is_input_signed; const bool output_signed = instr.conversion.is_output_signed; - Node value = GetRegister(instr.gpr20); + Node value = [&]() { + switch (opcode->get().GetId()) { + case OpCode::Id::I2I_R: + return GetRegister(instr.gpr20); + case OpCode::Id::I2I_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2I_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a, @@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::I2F_R: - case OpCode::Id::I2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); + case OpCode::Id::I2F_C: + case OpCode::Id::I2F_IMM: { + UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); UNIMPLEMENTED_IF(instr.conversion.selector); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in I2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::I2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::I2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2F_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); } }(); const bool input_signed = instr.conversion.is_input_signed; @@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2F_R: - case OpCode::Id::F2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); - UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); + case OpCode::Id::F2F_C: + case OpCode::Id::F2F_IMM: { + UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2F_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); value = [&]() { - switch (instr.conversion.f2f.rounding) { + switch (instr.conversion.f2f.GetRoundingMode()) { case Tegra::Shader::F2fRoundingOp::None: return value; case Tegra::Shader::F2fRoundingOp::Round: @@ -102,15 +132,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2I_R: - case OpCode::Id::F2I_C: { + case OpCode::Id::F2I_C: + case OpCode::Id::F2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2I is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2I_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2I_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2I_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); @@ -134,7 +171,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { }(); const bool is_signed = instr.conversion.is_output_signed; value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value); - value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed); + value = ConvertIntegerSize(value, instr.conversion.dst_size, is_signed); SetRegister(bb, instr.gpr0, value); break; diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 748368555..1dd94bf9d 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hset2.ftz != 0); + if (instr.hset2.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - // instr.hset2.type_a - // instr.hset2.type_b - Node op_a = GetRegister(instr.gpr8); Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSET2_R: @@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); - - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); + op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); - MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}}; - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b); + const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index e68512692..6e59eb650 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); - const Node op_b = [&]() { + Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSETP2_R: return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, @@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); + op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); // We can't use the constant predicate as destination. ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); @@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const OperationCode pair_combiner = instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}}; - const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b); + const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); const Node first_pred = Operation(pair_combiner, comparison); // Set the primary predicate to the result of Predicate OP SecondPredicate diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index 7a07c5ec6..5c1becce5 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { } constexpr auto identity = HalfType::H0_H1; - - const HalfType type_a = instr.hfma2.type_a; - const Node op_a = GetRegister(instr.gpr8); - bool neg_b{}, neg_c{}; auto [saturate, type_b, op_b, type_c, op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> { @@ -62,11 +58,11 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { }(); UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented"); - op_b = GetOperandAbsNegHalf(op_b, false, neg_b); - op_c = GetOperandAbsNegHalf(op_c, false, neg_c); + const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a); + op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b); + op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c); - MetaHalfArithmetic meta{true, {type_a, type_b, type_c}}; - Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c); + Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c); value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge); SetRegister(bb, instr.gpr0, value); diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index ac5112d78..17f2f711c 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -189,7 +189,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) { const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); - return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate); + return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate); +} + +Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) { + return Operation(OperationCode::HUnpack, type, value); } Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { @@ -209,17 +213,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) { if (absolute) { - value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value); + value = Operation(OperationCode::HAbsolute, NO_PRECISE, value); } if (negate) { - value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true), + value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true), GetPredicate(true)); } return value; } +Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) { + if (!saturate) { + return value; + } + const Node positive_zero = Immediate(std::copysignf(0, 1)); + const Node positive_one = Immediate(1.0f); + return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one); +} + Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalFLessThan}, {PredCondition::Equal, OperationCode::LogicalFEqual}, {PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, @@ -255,7 +268,7 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalILessThan}, {PredCondition::Equal, OperationCode::LogicalIEqual}, {PredCondition::LessEqual, OperationCode::LogicalILessEqual}, @@ -283,40 +296,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si return predicate; } -Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b) { - - UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan, - "Unimplemented NaN comparison for half floats"); - - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { +Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, + Node op_b) { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::Logical2HLessThan}, {PredCondition::Equal, OperationCode::Logical2HEqual}, {PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, {PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, - {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan}, - {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual}, - {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual}, - {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan}, - {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}}; + {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan}, + {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan}, + {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan}, + {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan}, + {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}}; const auto comparison{PredicateComparisonTable.find(condition)}; UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(), "Unknown predicate comparison operation"); - const Node predicate = Operation(comparison->second, meta, op_a, op_b); + const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); return predicate; } OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { - static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { + const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { {PredOperation::And, OperationCode::LogicalAnd}, {PredOperation::Or, OperationCode::LogicalOr}, {PredOperation::Xor, OperationCode::LogicalXor}, diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 57af8b10f..81278fb33 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -109,11 +109,13 @@ enum class OperationCode { UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint UBitCount, /// (MetaArithmetic, uint) -> uint - HAdd, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HMul, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HFma, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 + HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 HAbsolute, /// (f16vec2 a) -> f16vec2 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2 + HClamp, /// (f16vec2 src, float min, float max) -> f16vec2 + HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2 HMergeF32, /// (f16vec2 src) -> float HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2 @@ -150,12 +152,18 @@ enum class OperationCode { LogicalUNotEqual, /// (uint a, uint b) -> bool LogicalUGreaterEqual, /// (uint a, uint b) -> bool - Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Texture, /// (MetaTexture, float[N] coords) -> float4 TextureLod, /// (MetaTexture, float[N] coords) -> float4 @@ -308,13 +316,6 @@ struct MetaArithmetic { bool precise{}; }; -struct MetaHalfArithmetic { - bool precise{}; - std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1}; -}; - struct MetaTexture { const Sampler& sampler; Node array{}; @@ -326,11 +327,10 @@ struct MetaTexture { u32 element{}; }; -constexpr MetaArithmetic PRECISE = {true}; -constexpr MetaArithmetic NO_PRECISE = {false}; -constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false}; +inline constexpr MetaArithmetic PRECISE = {true}; +inline constexpr MetaArithmetic NO_PRECISE = {false}; -using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>; +using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>; /// Holds any kind of operation that can be done in the IR class OperationNode final { @@ -734,10 +734,14 @@ private: /// Unpacks a half immediate from an instruction Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation); + /// Unpacks a binary value into a half float pair with a type format + Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type); /// Merges a half pair into another value Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge); /// Conditionally absolute/negated half float pair. Absolute is applied first Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate); + /// Conditionally saturates a half float pair + Node GetSaturatedHalfFloat(Node value, bool saturate = true); /// Returns a predicate comparing two floats Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); @@ -745,8 +749,7 @@ private: Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed, Node op_a, Node op_b); /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared - Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b); + Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); /// Returns a predicate combiner operation OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 995d0e068..217805386 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 } } +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data) { + const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + std::size_t count = 0; + for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { + const std::size_t gob_address_y = + (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + + ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[y % gob_size_y]; + for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { + const std::size_t gob_address = + gob_address_y + (x / gob_size_x) * gob_size * block_height; + const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; + const u8* source_line = source_data + count; + u8* dest_addr = swizzle_data + swizzled_offset; + count++; + + std::memcpy(dest_addr, source_line, 1); + } + } +} + std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, u32 height) { std::vector<u8> rgba_data; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e078fa274..e072d8401 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, u32 offset_x, u32 offset_y); +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data); + } // namespace Tegra::Texture diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index cb82ecf3f..60cda0ca3 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -5,6 +5,8 @@ #include <memory> #include "core/core.h" #include "core/settings.h" +#include "video_core/gpu_asynch.h" +#include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/video_core.h" @@ -16,6 +18,14 @@ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_wind return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system); } +std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) { + if (Settings::values.use_asynchronous_gpu_emulation) { + return std::make_unique<VideoCommon::GPUAsynch>(system, system.Renderer()); + } + + return std::make_unique<VideoCommon::GPUSynch>(system, system.Renderer()); +} + u16 GetResolutionScaleFactor(const RendererBase& renderer) { return static_cast<u16>( Settings::values.resolution_factor diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 3c583f195..b8e0ac372 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -14,6 +14,10 @@ namespace Core::Frontend { class EmuWindow; } +namespace Tegra { +class GPU; +} + namespace VideoCore { class RendererBase; @@ -27,6 +31,9 @@ class RendererBase; std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, Core::System& system); +/// Creates an emulated GPU instance using the given system context. +std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system); + u16 GetResolutionScaleFactor(const RendererBase& renderer); } // namespace VideoCore diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index c29f2d2dc..7eed9fcf3 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -91,8 +91,8 @@ void EmuThread::run() { class GGLContext : public Core::Frontend::GraphicsContext { public: - explicit GGLContext(QOpenGLContext* shared_context) : surface() { - context = std::make_unique<QOpenGLContext>(shared_context); + explicit GGLContext(QOpenGLContext* shared_context) + : context{std::make_unique<QOpenGLContext>(shared_context)} { surface.setFormat(shared_context->format()); surface.create(); } @@ -186,8 +186,7 @@ private: }; GRenderWindow::GRenderWindow(QWidget* parent, EmuThread* emu_thread) - : QWidget(parent), child(nullptr), context(nullptr), emu_thread(emu_thread) { - + : QWidget(parent), emu_thread(emu_thread) { setWindowTitle(QStringLiteral("yuzu %1 | %2-%3") .arg(Common::g_build_name, Common::g_scm_branch, Common::g_scm_desc)); setAttribute(Qt::WA_AcceptTouchEvents); diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index 9608b959f..3df33aca1 100644 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -10,7 +10,6 @@ #include <QImage> #include <QThread> #include <QWidget> -#include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" |