diff options
Diffstat (limited to '')
-rw-r--r-- | src/core/arm/nce/arm_nce.cpp | 400 | ||||
-rw-r--r-- | src/core/arm/nce/arm_nce.h | 108 | ||||
-rw-r--r-- | src/core/arm/nce/arm_nce.s | 222 | ||||
-rw-r--r-- | src/core/arm/nce/arm_nce_asm_definitions.h | 29 | ||||
-rw-r--r-- | src/core/arm/nce/guest_context.h | 50 | ||||
-rw-r--r-- | src/core/arm/nce/instructions.h | 147 | ||||
-rw-r--r-- | src/core/arm/nce/patcher.cpp | 474 | ||||
-rw-r--r-- | src/core/arm/nce/patcher.h | 98 |
8 files changed, 1528 insertions, 0 deletions
diff --git a/src/core/arm/nce/arm_nce.cpp b/src/core/arm/nce/arm_nce.cpp new file mode 100644 index 000000000..f7bdafd39 --- /dev/null +++ b/src/core/arm/nce/arm_nce.cpp @@ -0,0 +1,400 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <cinttypes> +#include <memory> + +#include "common/signal_chain.h" +#include "core/arm/nce/arm_nce.h" +#include "core/arm/nce/patcher.h" +#include "core/core.h" +#include "core/memory.h" + +#include "core/hle/kernel/k_process.h" + +#include <signal.h> +#include <sys/syscall.h> +#include <unistd.h> + +namespace Core { + +namespace { + +struct sigaction g_orig_action; + +// Verify assembly offsets. +using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters; +static_assert(offsetof(NativeExecutionParameters, native_context) == TpidrEl0NativeContext); +static_assert(offsetof(NativeExecutionParameters, lock) == TpidrEl0Lock); +static_assert(offsetof(NativeExecutionParameters, magic) == TpidrEl0TlsMagic); + +fpsimd_context* GetFloatingPointState(mcontext_t& host_ctx) { + _aarch64_ctx* header = reinterpret_cast<_aarch64_ctx*>(&host_ctx.__reserved); + while (header->magic != FPSIMD_MAGIC) { + header = reinterpret_cast<_aarch64_ctx*>(reinterpret_cast<char*>(header) + header->size); + } + return reinterpret_cast<fpsimd_context*>(header); +} + +} // namespace + +void* ARM_NCE::RestoreGuestContext(void* raw_context) { + // Retrieve the host context. + auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext; + + // Thread-local parameters will be located in x9. + auto* tpidr = reinterpret_cast<NativeExecutionParameters*>(host_ctx.regs[9]); + auto* guest_ctx = static_cast<GuestContext*>(tpidr->native_context); + + // Retrieve the host floating point state. + auto* fpctx = GetFloatingPointState(host_ctx); + + // Save host callee-saved registers. 
+ std::memcpy(guest_ctx->host_ctx.host_saved_vregs.data(), &fpctx->vregs[8], + sizeof(guest_ctx->host_ctx.host_saved_vregs)); + std::memcpy(guest_ctx->host_ctx.host_saved_regs.data(), &host_ctx.regs[19], + sizeof(guest_ctx->host_ctx.host_saved_regs)); + + // Save stack pointer. + guest_ctx->host_ctx.host_sp = host_ctx.sp; + + // Restore all guest state except tpidr_el0. + host_ctx.sp = guest_ctx->sp; + host_ctx.pc = guest_ctx->pc; + host_ctx.pstate = guest_ctx->pstate; + fpctx->fpcr = guest_ctx->fpcr; + fpctx->fpsr = guest_ctx->fpsr; + std::memcpy(host_ctx.regs, guest_ctx->cpu_registers.data(), sizeof(host_ctx.regs)); + std::memcpy(fpctx->vregs, guest_ctx->vector_registers.data(), sizeof(fpctx->vregs)); + + // Return the new thread-local storage pointer. + return tpidr; +} + +void ARM_NCE::SaveGuestContext(GuestContext* guest_ctx, void* raw_context) { + // Retrieve the host context. + auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext; + + // Retrieve the host floating point state. + auto* fpctx = GetFloatingPointState(host_ctx); + + // Save all guest registers except tpidr_el0. + std::memcpy(guest_ctx->cpu_registers.data(), host_ctx.regs, sizeof(host_ctx.regs)); + std::memcpy(guest_ctx->vector_registers.data(), fpctx->vregs, sizeof(fpctx->vregs)); + guest_ctx->fpsr = fpctx->fpsr; + guest_ctx->fpcr = fpctx->fpcr; + guest_ctx->pstate = static_cast<u32>(host_ctx.pstate); + guest_ctx->pc = host_ctx.pc; + guest_ctx->sp = host_ctx.sp; + + // Restore stack pointer. + host_ctx.sp = guest_ctx->host_ctx.host_sp; + + // Restore host callee-saved registers. + std::memcpy(&host_ctx.regs[19], guest_ctx->host_ctx.host_saved_regs.data(), + sizeof(guest_ctx->host_ctx.host_saved_regs)); + std::memcpy(&fpctx->vregs[8], guest_ctx->host_ctx.host_saved_vregs.data(), + sizeof(guest_ctx->host_ctx.host_saved_vregs)); + + // Return from the call on exit by setting pc to x30. + host_ctx.pc = guest_ctx->host_ctx.host_saved_regs[11]; + + // Clear esr_el1 and return it. 
+ host_ctx.regs[0] = guest_ctx->esr_el1.exchange(0); +} + +bool ARM_NCE::HandleGuestFault(GuestContext* guest_ctx, void* raw_info, void* raw_context) { + auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext; + auto* info = static_cast<siginfo_t*>(raw_info); + + // Try to handle an invalid access. + // TODO: handle accesses which split a page? + const Common::ProcessAddress addr = + (reinterpret_cast<u64>(info->si_addr) & ~Memory::YUZU_PAGEMASK); + if (guest_ctx->system->ApplicationMemory().InvalidateNCE(addr, Memory::YUZU_PAGESIZE)) { + // We handled the access successfully and are returning to guest code. + return true; + } + + // We can't handle the access, so determine why we crashed. + const bool is_prefetch_abort = host_ctx.pc == reinterpret_cast<u64>(info->si_addr); + + // For data aborts, skip the instruction and return to guest code. + // This will allow games to continue in many scenarios where they would otherwise crash. + if (!is_prefetch_abort) { + host_ctx.pc += 4; + return true; + } + + // This is a prefetch abort. + guest_ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::PrefetchAbort)); + + // Forcibly mark the context as locked. We are still running. + // We may race with SignalInterrupt here: + // - If we lose the race, then SignalInterrupt will send us a signal we are masking, + // and it will do nothing when it is unmasked, as we have already left guest code. + // - If we win the race, then SignalInterrupt will wait for us to unlock first. + auto& thread_params = guest_ctx->parent->running_thread->GetNativeExecutionParameters(); + thread_params.lock.store(SpinLockLocked); + + // Return to host. + SaveGuestContext(guest_ctx, raw_context); + return false; +} + +void ARM_NCE::HandleHostFault(int sig, void* raw_info, void* raw_context) { + return g_orig_action.sa_sigaction(sig, static_cast<siginfo_t*>(raw_info), raw_context); +} + +HaltReason ARM_NCE::RunJit() { + // Get the thread parameters. 
+ // TODO: pass the current thread down from ::Run + auto* thread = Kernel::GetCurrentThreadPointer(system.Kernel()); + auto* thread_params = &thread->GetNativeExecutionParameters(); + + { + // Lock our core context. + std::scoped_lock lk{lock}; + + // We should not be running. + ASSERT(running_thread == nullptr); + + // Check if we need to run. If we have already been halted, we are done. + u64 halt = guest_ctx.esr_el1.exchange(0); + if (halt != 0) { + return static_cast<HaltReason>(halt); + } + + // Mark that we are running. + running_thread = thread; + + // Acquire the lock on the thread parameters. + // This allows us to force synchronization with SignalInterrupt. + LockThreadParameters(thread_params); + } + + // Assign current members. + guest_ctx.parent = this; + thread_params->native_context = &guest_ctx; + thread_params->tpidr_el0 = guest_ctx.tpidr_el0; + thread_params->tpidrro_el0 = guest_ctx.tpidrro_el0; + thread_params->is_running = true; + + HaltReason halt{}; + + // TODO: finding and creating the post handler needs to be locked + // to deal with dynamic loading of NROs. + const auto& post_handlers = system.ApplicationProcess()->GetPostHandlers(); + if (auto it = post_handlers.find(guest_ctx.pc); it != post_handlers.end()) { + halt = ReturnToRunCodeByTrampoline(thread_params, &guest_ctx, it->second); + } else { + halt = ReturnToRunCodeByExceptionLevelChange(thread_id, thread_params); + } + + // Unload members. + // The thread does not change, so we can persist the old reference. + guest_ctx.tpidr_el0 = thread_params->tpidr_el0; + thread_params->native_context = nullptr; + thread_params->is_running = false; + + // Unlock the thread parameters. + UnlockThreadParameters(thread_params); + + { + // Lock the core context. + std::scoped_lock lk{lock}; + + // On exit, we no longer have an active thread. + running_thread = nullptr; + } + + // Return the halt reason. 
+ return halt; +} + +HaltReason ARM_NCE::StepJit() { + return HaltReason::StepThread; +} + +u32 ARM_NCE::GetSvcNumber() const { + return guest_ctx.svc_swi; +} + +ARM_NCE::ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_) + : ARM_Interface{system_, uses_wall_clock_}, core_index{core_index_} { + guest_ctx.system = &system_; +} + +ARM_NCE::~ARM_NCE() = default; + +void ARM_NCE::Initialize() { + thread_id = gettid(); + + // Setup our signals + static std::once_flag flag; + std::call_once(flag, [] { + using HandlerType = decltype(sigaction::sa_sigaction); + + sigset_t signal_mask; + sigemptyset(&signal_mask); + sigaddset(&signal_mask, ReturnToRunCodeByExceptionLevelChangeSignal); + sigaddset(&signal_mask, BreakFromRunCodeSignal); + sigaddset(&signal_mask, GuestFaultSignal); + + struct sigaction return_to_run_code_action {}; + return_to_run_code_action.sa_flags = SA_SIGINFO | SA_ONSTACK; + return_to_run_code_action.sa_sigaction = reinterpret_cast<HandlerType>( + &ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler); + return_to_run_code_action.sa_mask = signal_mask; + Common::SigAction(ReturnToRunCodeByExceptionLevelChangeSignal, &return_to_run_code_action, + nullptr); + + struct sigaction break_from_run_code_action {}; + break_from_run_code_action.sa_flags = SA_SIGINFO | SA_ONSTACK; + break_from_run_code_action.sa_sigaction = + reinterpret_cast<HandlerType>(&ARM_NCE::BreakFromRunCodeSignalHandler); + break_from_run_code_action.sa_mask = signal_mask; + Common::SigAction(BreakFromRunCodeSignal, &break_from_run_code_action, nullptr); + + struct sigaction fault_action {}; + fault_action.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART; + fault_action.sa_sigaction = + reinterpret_cast<HandlerType>(&ARM_NCE::GuestFaultSignalHandler); + fault_action.sa_mask = signal_mask; + Common::SigAction(GuestFaultSignal, &fault_action, &g_orig_action); + + // Simplify call for g_orig_action. 
+ // These fields occupy the same space in memory, so this should be a no-op in practice. + if (!(g_orig_action.sa_flags & SA_SIGINFO)) { + g_orig_action.sa_sigaction = + reinterpret_cast<decltype(g_orig_action.sa_sigaction)>(g_orig_action.sa_handler); + } + }); +} + +void ARM_NCE::SetPC(u64 pc) { + guest_ctx.pc = pc; +} + +u64 ARM_NCE::GetPC() const { + return guest_ctx.pc; +} + +u64 ARM_NCE::GetSP() const { + return guest_ctx.sp; +} + +u64 ARM_NCE::GetReg(int index) const { + return guest_ctx.cpu_registers[index]; +} + +void ARM_NCE::SetReg(int index, u64 value) { + guest_ctx.cpu_registers[index] = value; +} + +u128 ARM_NCE::GetVectorReg(int index) const { + return guest_ctx.vector_registers[index]; +} + +void ARM_NCE::SetVectorReg(int index, u128 value) { + guest_ctx.vector_registers[index] = value; +} + +u32 ARM_NCE::GetPSTATE() const { + return guest_ctx.pstate; +} + +void ARM_NCE::SetPSTATE(u32 pstate) { + guest_ctx.pstate = pstate; +} + +u64 ARM_NCE::GetTlsAddress() const { + return guest_ctx.tpidrro_el0; +} + +void ARM_NCE::SetTlsAddress(u64 address) { + guest_ctx.tpidrro_el0 = address; +} + +u64 ARM_NCE::GetTPIDR_EL0() const { + return guest_ctx.tpidr_el0; +} + +void ARM_NCE::SetTPIDR_EL0(u64 value) { + guest_ctx.tpidr_el0 = value; +} + +void ARM_NCE::SaveContext(ThreadContext64& ctx) const { + ctx.cpu_registers = guest_ctx.cpu_registers; + ctx.sp = guest_ctx.sp; + ctx.pc = guest_ctx.pc; + ctx.pstate = guest_ctx.pstate; + ctx.vector_registers = guest_ctx.vector_registers; + ctx.fpcr = guest_ctx.fpcr; + ctx.fpsr = guest_ctx.fpsr; + ctx.tpidr = guest_ctx.tpidr_el0; +} + +void ARM_NCE::LoadContext(const ThreadContext64& ctx) { + guest_ctx.cpu_registers = ctx.cpu_registers; + guest_ctx.sp = ctx.sp; + guest_ctx.pc = ctx.pc; + guest_ctx.pstate = ctx.pstate; + guest_ctx.vector_registers = ctx.vector_registers; + guest_ctx.fpcr = ctx.fpcr; + guest_ctx.fpsr = ctx.fpsr; + guest_ctx.tpidr_el0 = ctx.tpidr; +} + +void ARM_NCE::SignalInterrupt() { + // Lock core 
context. + std::scoped_lock lk{lock}; + + // Add break loop condition. + guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop)); + + // If there is no thread running, we are done. + if (running_thread == nullptr) { + return; + } + + // Lock the thread context. + auto* params = &running_thread->GetNativeExecutionParameters(); + LockThreadParameters(params); + + if (params->is_running) { + // We should signal to the running thread. + // The running thread will unlock the thread context. + syscall(SYS_tkill, thread_id, BreakFromRunCodeSignal); + } else { + // If the thread is no longer running, we have nothing to do. + UnlockThreadParameters(params); + } +} + +void ARM_NCE::ClearInterrupt() { + guest_ctx.esr_el1 = {}; +} + +void ARM_NCE::ClearInstructionCache() { + // TODO: This is not possible to implement correctly on Linux because + // we do not have any access to ic iallu. + + // Require accesses to complete. + std::atomic_thread_fence(std::memory_order_seq_cst); +} + +void ARM_NCE::InvalidateCacheRange(u64 addr, std::size_t size) { + this->ClearInstructionCache(); +} + +void ARM_NCE::ClearExclusiveState() { + // No-op. +} + +void ARM_NCE::PageTableChanged(Common::PageTable& page_table, + std::size_t new_address_space_size_in_bits) { + // No-op. Page table is never used. 
+} + +} // namespace Core diff --git a/src/core/arm/nce/arm_nce.h b/src/core/arm/nce/arm_nce.h new file mode 100644 index 000000000..5fbd6dbf3 --- /dev/null +++ b/src/core/arm/nce/arm_nce.h @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <atomic> +#include <memory> +#include <span> +#include <unordered_map> +#include <vector> + +#include "core/arm/arm_interface.h" +#include "core/arm/nce/guest_context.h" + +namespace Core::Memory { +class Memory; +} + +namespace Core { + +class System; + +class ARM_NCE final : public ARM_Interface { +public: + ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_); + + ~ARM_NCE() override; + + void Initialize() override; + void SetPC(u64 pc) override; + u64 GetPC() const override; + u64 GetSP() const override; + u64 GetReg(int index) const override; + void SetReg(int index, u64 value) override; + u128 GetVectorReg(int index) const override; + void SetVectorReg(int index, u128 value) override; + + u32 GetPSTATE() const override; + void SetPSTATE(u32 pstate) override; + u64 GetTlsAddress() const override; + void SetTlsAddress(u64 address) override; + void SetTPIDR_EL0(u64 value) override; + u64 GetTPIDR_EL0() const override; + + Architecture GetArchitecture() const override { + return Architecture::Aarch64; + } + + void SaveContext(ThreadContext32& ctx) const override {} + void SaveContext(ThreadContext64& ctx) const override; + void LoadContext(const ThreadContext32& ctx) override {} + void LoadContext(const ThreadContext64& ctx) override; + + void SignalInterrupt() override; + void ClearInterrupt() override; + void ClearExclusiveState() override; + void ClearInstructionCache() override; + void InvalidateCacheRange(u64 addr, std::size_t size) override; + void PageTableChanged(Common::PageTable& new_page_table, + std::size_t new_address_space_size_in_bits) override; + +protected: + HaltReason RunJit() 
override; + HaltReason StepJit() override; + + u32 GetSvcNumber() const override; + + const Kernel::DebugWatchpoint* HaltedWatchpoint() const override { + return nullptr; + } + + void RewindBreakpointInstruction() override {} + +private: + // Assembly definitions. + static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx, + u64 trampoline_addr); + static HaltReason ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr); + + static void ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info, + void* raw_context); + static void BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context); + static void GuestFaultSignalHandler(int sig, void* info, void* raw_context); + + static void LockThreadParameters(void* tpidr); + static void UnlockThreadParameters(void* tpidr); + +private: + // C++ implementation functions for assembly definitions. + static void* RestoreGuestContext(void* raw_context); + static void SaveGuestContext(GuestContext* ctx, void* raw_context); + static bool HandleGuestFault(GuestContext* ctx, void* info, void* raw_context); + static void HandleHostFault(int sig, void* info, void* raw_context); + +public: + // Members set on initialization. + std::size_t core_index{}; + pid_t thread_id{-1}; + + // Core context. + GuestContext guest_ctx; + + // Thread and invalidation info. 
+ std::mutex lock; + Kernel::KThread* running_thread{}; +}; + +} // namespace Core diff --git a/src/core/arm/nce/arm_nce.s b/src/core/arm/nce/arm_nce.s new file mode 100644 index 000000000..b98e09f31 --- /dev/null +++ b/src/core/arm/nce/arm_nce.s @@ -0,0 +1,222 @@ +/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "core/arm/nce/arm_nce_asm_definitions.h" + +#define LOAD_IMMEDIATE_32(reg, val) \ + mov reg, #(((val) >> 0x00) & 0xFFFF); \ + movk reg, #(((val) >> 0x10) & 0xFFFF), lsl #16 + + +/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByTrampoline(void* tpidr, Core::GuestContext* ctx, u64 trampoline_addr) */ +.section .text._ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, "ax", %progbits +.global _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm +.type _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, %function +_ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm: + /* Back up host sp to x3. */ + /* Back up host tpidr_el0 to x4. */ + mov x3, sp + mrs x4, tpidr_el0 + + /* Load guest sp. x5 is used as a scratch register. */ + ldr x5, [x1, #(GuestContextSp)] + mov sp, x5 + + /* Offset GuestContext pointer to the host member. */ + add x5, x1, #(GuestContextHostContext) + + /* Save original host sp and tpidr_el0 (x3, x4) to host context. */ + stp x3, x4, [x5, #(HostContextSpTpidrEl0)] + + /* Save all callee-saved host GPRs. */ + stp x19, x20, [x5, #(HostContextRegs+0x0)] + stp x21, x22, [x5, #(HostContextRegs+0x10)] + stp x23, x24, [x5, #(HostContextRegs+0x20)] + stp x25, x26, [x5, #(HostContextRegs+0x30)] + stp x27, x28, [x5, #(HostContextRegs+0x40)] + stp x29, x30, [x5, #(HostContextRegs+0x50)] + + /* Save all callee-saved host FPRs. 
*/ + stp q8, q9, [x5, #(HostContextVregs+0x0)] + stp q10, q11, [x5, #(HostContextVregs+0x20)] + stp q12, q13, [x5, #(HostContextVregs+0x40)] + stp q14, q15, [x5, #(HostContextVregs+0x60)] + + /* Load guest tpidr_el0 from argument. */ + msr tpidr_el0, x0 + + /* Tail call the trampoline to restore guest state. */ + br x2 + + +/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr) */ +.section .text._ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, "ax", %progbits +.global _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv +.type _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, %function +_ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv: + /* This jumps to the signal handler, which will restore the entire context. */ + /* On entry, x0 = thread id, which is already in the right place. */ + + /* Move tpidr to x9 so it is not trampled. */ + mov x9, x1 + + /* Set up arguments. */ + mov x8, #(__NR_tkill) + mov x1, #(ReturnToRunCodeByExceptionLevelChangeSignal) + + /* Tail call the signal handler. */ + svc #0 + + /* Block execution from flowing here. */ + brk #1000 + + +/* static void Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info, void* raw_context) */ +.section .text._ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, "ax", %progbits +.global _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_ +.type _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, %function +_ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_: + stp x29, x30, [sp, #-0x10]! + mov x29, sp + + /* Call the context restorer with the raw context. */ + mov x0, x2 + bl _ZN4Core7ARM_NCE19RestoreGuestContextEPv + + /* Save the old value of tpidr_el0. 
*/ + mrs x8, tpidr_el0 + ldr x9, [x0, #(TpidrEl0NativeContext)] + str x8, [x9, #(GuestContextHostContext + HostContextTpidrEl0)] + + /* Set our new tpidr_el0. */ + msr tpidr_el0, x0 + + /* Unlock the context. */ + bl _ZN4Core7ARM_NCE22UnlockThreadParametersEPv + + /* Returning from here will enter the guest. */ + ldp x29, x30, [sp], #0x10 + ret + + +/* static void Core::ARM_NCE::BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context) */ +.section .text._ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, "ax", %progbits +.global _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_ +.type _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, %function +_ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_: + /* Check to see if we have the correct TLS magic. */ + mrs x8, tpidr_el0 + ldr w9, [x8, #(TpidrEl0TlsMagic)] + + LOAD_IMMEDIATE_32(w10, TlsMagic) + + cmp w9, w10 + b.ne 1f + + /* Correct TLS magic, so this is a guest interrupt. */ + /* Restore host tpidr_el0. */ + ldr x0, [x8, #(TpidrEl0NativeContext)] + ldr x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)] + msr tpidr_el0, x3 + + /* Tail call the restorer. */ + mov x1, x2 + b _ZN4Core7ARM_NCE16SaveGuestContextEPNS_12GuestContextEPv + + /* Returning from here will enter host code. */ + +1: + /* Incorrect TLS magic, so this is a spurious signal. */ + ret + + +/* static void Core::ARM_NCE::GuestFaultSignalHandler(int sig, void* info, void* raw_context) */ +.section .text._ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, "ax", %progbits +.global _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_ +.type _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, %function +_ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_: + /* Check to see if we have the correct TLS magic. */ + mrs x8, tpidr_el0 + ldr w9, [x8, #(TpidrEl0TlsMagic)] + + LOAD_IMMEDIATE_32(w10, TlsMagic) + + cmp w9, w10 + b.eq 1f + + /* Incorrect TLS magic, so this is a host fault. */ + /* Tail call the handler. 
*/ + b _ZN4Core7ARM_NCE15HandleHostFaultEiPvS1_ + +1: + /* Correct TLS magic, so this is a guest fault. */ + stp x29, x30, [sp, #-0x20]! + str x19, [sp, #0x10] + mov x29, sp + + /* Save the old tpidr_el0. */ + mov x19, x8 + + /* Restore host tpidr_el0. */ + ldr x0, [x8, #(TpidrEl0NativeContext)] + ldr x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)] + msr tpidr_el0, x3 + + /* Call the handler. */ + bl _ZN4Core7ARM_NCE16HandleGuestFaultEPNS_12GuestContextEPvS3_ + + /* If the handler returned false, we want to preserve the host tpidr_el0. */ + cbz x0, 2f + + /* Otherwise, restore guest tpidr_el0. */ + msr tpidr_el0, x19 + +2: + ldr x19, [sp, #0x10] + ldp x29, x30, [sp], #0x20 + ret + + +/* static void Core::ARM_NCE::LockThreadParameters(void* tpidr) */ +.section .text._ZN4Core7ARM_NCE20LockThreadParametersEPv, "ax", %progbits +.global _ZN4Core7ARM_NCE20LockThreadParametersEPv +.type _ZN4Core7ARM_NCE20LockThreadParametersEPv, %function +_ZN4Core7ARM_NCE20LockThreadParametersEPv: + /* Offset to lock member. */ + add x0, x0, #(TpidrEl0Lock) + +1: + /* Clear the monitor. */ + clrex + +2: + /* Load-linked with acquire ordering. */ + ldaxr w1, [x0] + + /* If the value was SpinLockLocked, clear monitor and retry. */ + cbz w1, 1b + + /* Store-conditional SpinLockLocked with relaxed ordering. */ + stxr w1, wzr, [x0] + + /* If we failed to store, retry. */ + cbnz w1, 2b + + ret + + +/* static void Core::ARM_NCE::UnlockThreadParameters(void* tpidr) */ +.section .text._ZN4Core7ARM_NCE22UnlockThreadParametersEPv, "ax", %progbits +.global _ZN4Core7ARM_NCE22UnlockThreadParametersEPv +.type _ZN4Core7ARM_NCE22UnlockThreadParametersEPv, %function +_ZN4Core7ARM_NCE22UnlockThreadParametersEPv: + /* Offset to lock member. */ + add x0, x0, #(TpidrEl0Lock) + + /* Load SpinLockUnlocked. */ + mov w1, #(SpinLockUnlocked) + + /* Store value with release ordering. 
*/ + stlr w1, [x0] + + ret diff --git a/src/core/arm/nce/arm_nce_asm_definitions.h b/src/core/arm/nce/arm_nce_asm_definitions.h new file mode 100644 index 000000000..8a9b285b5 --- /dev/null +++ b/src/core/arm/nce/arm_nce_asm_definitions.h @@ -0,0 +1,29 @@ +/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#pragma once + +#define __ASSEMBLY__ + +#include <asm-generic/signal.h> +#include <asm-generic/unistd.h> + +#define ReturnToRunCodeByExceptionLevelChangeSignal SIGUSR2 +#define BreakFromRunCodeSignal SIGURG +#define GuestFaultSignal SIGSEGV + +#define GuestContextSp 0xF8 +#define GuestContextHostContext 0x320 + +#define HostContextSpTpidrEl0 0xE0 +#define HostContextTpidrEl0 0xE8 +#define HostContextRegs 0x0 +#define HostContextVregs 0x60 + +#define TpidrEl0NativeContext 0x10 +#define TpidrEl0Lock 0x18 +#define TpidrEl0TlsMagic 0x20 +#define TlsMagic 0x555a5559 + +#define SpinLockLocked 0 +#define SpinLockUnlocked 1 diff --git a/src/core/arm/nce/guest_context.h b/src/core/arm/nce/guest_context.h new file mode 100644 index 000000000..0767a0337 --- /dev/null +++ b/src/core/arm/nce/guest_context.h @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "core/arm/arm_interface.h" +#include "core/arm/nce/arm_nce_asm_definitions.h" + +namespace Core { + +class ARM_NCE; +class System; + +struct HostContext { + alignas(16) std::array<u64, 12> host_saved_regs{}; + alignas(16) std::array<u128, 8> host_saved_vregs{}; + u64 host_sp{}; + void* host_tpidr_el0{}; +}; + +struct GuestContext { + std::array<u64, 31> cpu_registers{}; + u64 sp{}; + u64 pc{}; + u32 fpcr{}; + u32 fpsr{}; + std::array<u128, 32> vector_registers{}; + u32 pstate{}; + alignas(16) HostContext host_ctx{}; + u64 tpidrro_el0{}; + u64 tpidr_el0{}; + std::atomic<u64> esr_el1{}; + 
u32 nzcv{}; + u32 svc_swi{}; + System* system{}; + ARM_NCE* parent{}; +}; + +// Verify assembly offsets. +static_assert(offsetof(GuestContext, sp) == GuestContextSp); +static_assert(offsetof(GuestContext, host_ctx) == GuestContextHostContext); +static_assert(offsetof(HostContext, host_sp) == HostContextSpTpidrEl0); +static_assert(offsetof(HostContext, host_tpidr_el0) - 8 == HostContextSpTpidrEl0); +static_assert(offsetof(HostContext, host_tpidr_el0) == HostContextTpidrEl0); +static_assert(offsetof(HostContext, host_saved_regs) == HostContextRegs); +static_assert(offsetof(HostContext, host_saved_vregs) == HostContextVregs); + +} // namespace Core diff --git a/src/core/arm/nce/instructions.h b/src/core/arm/nce/instructions.h new file mode 100644 index 000000000..5b56ff857 --- /dev/null +++ b/src/core/arm/nce/instructions.h @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: Copyright © 2020 Skyline Team and Contributors +// SPDX-License-Identifier: MPL-2.0 + +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Core::NCE { + +enum SystemRegister : u32 { + TpidrEl0 = 0x5E82, + TpidrroEl0 = 0x5E83, + CntfrqEl0 = 0x5F00, + CntpctEl0 = 0x5F01, +}; + +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SVC--Supervisor-Call- +union SVC { + constexpr explicit SVC(u32 raw_) : raw{raw_} {} + + constexpr bool Verify() { + return (this->GetSig0() == 0x1 && this->GetSig1() == 0x6A0); + } + + constexpr u32 GetSig0() { + return decltype(sig0)::ExtractValue(raw); + } + + constexpr u32 GetValue() { + return decltype(value)::ExtractValue(raw); + } + + constexpr u32 GetSig1() { + return decltype(sig1)::ExtractValue(raw); + } + + u32 raw; + +private: + BitField<0, 5, u32> sig0; // 0x1 + BitField<5, 16, u32> value; // 16-bit immediate + BitField<21, 11, u32> sig1; // 0x6A0 +}; +static_assert(sizeof(SVC) == sizeof(u32)); +static_assert(SVC(0xD40000C1).Verify()); +static_assert(SVC(0xD40000C1).GetValue() == 0x6); + +// 
https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MRS--Move-System-Register- +union MRS { + constexpr explicit MRS(u32 raw_) : raw{raw_} {} + + constexpr bool Verify() { + return (this->GetSig() == 0xD53); + } + + constexpr u32 GetRt() { + return decltype(rt)::ExtractValue(raw); + } + + constexpr u32 GetSystemReg() { + return decltype(system_reg)::ExtractValue(raw); + } + + constexpr u32 GetSig() { + return decltype(sig)::ExtractValue(raw); + } + + u32 raw; + +private: + BitField<0, 5, u32> rt; // destination register + BitField<5, 15, u32> system_reg; // source system register + BitField<20, 12, u32> sig; // 0xD53 +}; +static_assert(sizeof(MRS) == sizeof(u32)); +static_assert(MRS(0xD53BE020).Verify()); +static_assert(MRS(0xD53BE020).GetSystemReg() == CntpctEl0); +static_assert(MRS(0xD53BE020).GetRt() == 0x0); + +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register- +union MSR { + constexpr explicit MSR(u32 raw_) : raw{raw_} {} + + constexpr bool Verify() { + return this->GetSig() == 0xD51; + } + + constexpr u32 GetRt() { + return decltype(rt)::ExtractValue(raw); + } + + constexpr u32 GetSystemReg() { + return decltype(system_reg)::ExtractValue(raw); + } + + constexpr u32 GetSig() { + return decltype(sig)::ExtractValue(raw); + } + + u32 raw; + +private: + BitField<0, 5, u32> rt; // source register + BitField<5, 15, u32> system_reg; // destination system register + BitField<20, 12, u32> sig; // 0xD51 +}; +static_assert(sizeof(MSR) == sizeof(u32)); +static_assert(MSR(0xD51BD040).Verify()); +static_assert(MSR(0xD51BD040).GetSystemReg() == TpidrEl0); +static_assert(MSR(0xD51BD040).GetRt() == 0x0); + +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers- +// 
https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers- +union Exclusive { + constexpr explicit Exclusive(u32 raw_) : raw{raw_} {} + + constexpr bool Verify() { + return this->GetSig() == 0x10; + } + + constexpr u32 GetSig() { + return decltype(sig)::ExtractValue(raw); + } + + constexpr u32 AsOrdered() { + return raw | decltype(o0)::FormatValue(1); + } + + u32 raw; + +private: + BitField<0, 5, u32> rt; // memory operand + BitField<5, 5, u32> rn; // register operand 1 + BitField<10, 5, u32> rt2; // register operand 2 + BitField<15, 1, u32> o0; // ordered + BitField<16, 5, u32> rs; // status register + BitField<21, 2, u32> l; // operation type + BitField<23, 7, u32> sig; // 0x10 + BitField<30, 2, u32> size; // size +}; +static_assert(Exclusive(0xC85FFC00).Verify()); +static_assert(Exclusive(0xC85FFC00).AsOrdered() == 0xC85FFC00); +static_assert(Exclusive(0xC85F7C00).AsOrdered() == 0xC85FFC00); +static_assert(Exclusive(0xC8200440).AsOrdered() == 0xC8208440); + +} // namespace Core::NCE diff --git a/src/core/arm/nce/patcher.cpp b/src/core/arm/nce/patcher.cpp new file mode 100644 index 000000000..ec8527224 --- /dev/null +++ b/src/core/arm/nce/patcher.cpp @@ -0,0 +1,474 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/arm64/native_clock.h" +#include "common/bit_cast.h" +#include "common/literals.h" +#include "core/arm/nce/arm_nce.h" +#include "core/arm/nce/guest_context.h" +#include "core/arm/nce/instructions.h" +#include "core/arm/nce/patcher.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/hle/kernel/svc.h" + +namespace Core::NCE { + +using namespace Common::Literals; +using namespace oaknut::util; + +using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters; + +constexpr 
size_t MaxRelativeBranch = 128_MiB;
// Offset (in words) of the first patchable instruction; the module header before it is skipped.
constexpr u32 ModuleCodeIndex = 0x24 / sizeof(u32);

Patcher::Patcher() : c(m_patch_instructions) {}

Patcher::~Patcher() = default;

// Scans the module's .text for instructions that cannot run natively under NCE
// (SVC, TPIDR/TPIDRRO/CNTPCT system-register accesses) and emits patch-section
// code for each, recording relocations to be resolved later in RelocateAndCopy.
void Patcher::PatchText(const Kernel::PhysicalMemory& program_image,
                        const Kernel::CodeSet::Segment& code) {

    // Write save context helper function.
    c.l(m_save_context);
    WriteSaveContext();

    // Write load context helper function.
    c.l(m_load_context);
    WriteLoadContext();

    // Retrieve text segment data.
    const auto text = std::span{program_image}.subspan(code.offset, code.size);
    const auto text_words =
        std::span<const u32>{reinterpret_cast<const u32*>(text.data()), text.size() / sizeof(u32)};

    // Loop through instructions, patching as needed.
    for (u32 i = ModuleCodeIndex; i < static_cast<u32>(text_words.size()); i++) {
        const u32 inst = text_words[i];

        // Registers a branch from the current module instruction into the patch
        // section, and returns the offset of the following instruction so the
        // patch can branch back after emulation.
        const auto AddRelocations = [&] {
            const uintptr_t this_offset = i * sizeof(u32);
            const uintptr_t next_offset = this_offset + sizeof(u32);

            // Relocate from here to patch.
            this->BranchToPatch(this_offset);

            // Relocate from patch to next instruction.
            return next_offset;
        };

        // SVC
        if (auto svc = SVC{inst}; svc.Verify()) {
            WriteSvcTrampoline(AddRelocations(), svc.GetValue());
            continue;
        }

        // MRS Xn, TPIDR_EL0
        // MRS Xn, TPIDRRO_EL0
        if (auto mrs = MRS{inst};
            mrs.Verify() && (mrs.GetSystemReg() == TpidrroEl0 || mrs.GetSystemReg() == TpidrEl0)) {
            const auto src_reg = mrs.GetSystemReg() == TpidrroEl0 ? oaknut::SystemReg::TPIDRRO_EL0
                                                                  : oaknut::SystemReg::TPIDR_EL0;
            const auto dest_reg = oaknut::XReg{static_cast<int>(mrs.GetRt())};
            WriteMrsHandler(AddRelocations(), dest_reg, src_reg);
            continue;
        }

        // MRS Xn, CNTPCT_EL0
        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntpctEl0) {
            WriteCntpctHandler(AddRelocations(), oaknut::XReg{static_cast<int>(mrs.GetRt())});
            continue;
        }

        // MRS Xn, CNTFRQ_EL0
        // No guest module is expected to read the counter frequency directly.
        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntfrqEl0) {
            UNREACHABLE();
        }

        // MSR TPIDR_EL0, Xn
        if (auto msr = MSR{inst}; msr.Verify() && msr.GetSystemReg() == TpidrEl0) {
            WriteMsrHandler(AddRelocations(), oaknut::XReg{static_cast<int>(msr.GetRt())});
            continue;
        }

        // Record exclusives so they can be rewritten as ordered accesses later.
        if (auto exclusive = Exclusive{inst}; exclusive.Verify()) {
            m_exclusives.push_back(i);
        }
    }

    // Determine patching mode for the final relocation step.
    // If the image is too large for a ±128 MiB relative branch to span it, the
    // patch section must be placed before .text instead of after .data.
    const size_t image_size = program_image.size();
    this->mode = image_size > MaxRelativeBranch ? PatchMode::PreText : PatchMode::PostData;
}

// Resolves every relocation recorded by PatchText against the final load base,
// rewrites recorded exclusives as ordered, and copies the patch section into
// the program image (before .text or after .data depending on mode).
void Patcher::RelocateAndCopy(Common::ProcessAddress load_base,
                              const Kernel::CodeSet::Segment& code,
                              Kernel::PhysicalMemory& program_image,
                              EntryTrampolines* out_trampolines) {
    const size_t patch_size = GetSectionSize();
    const size_t image_size = program_image.size();

    // Retrieve text segment data.
    const auto text = std::span{program_image}.subspan(code.offset, code.size);
    const auto text_words =
        std::span<u32>{reinterpret_cast<u32*>(text.data()), text.size() / sizeof(u32)};

    // Writes a B from a module instruction into the patch section.
    const auto ApplyBranchToPatchRelocation = [&](u32* target, const Relocation& rel) {
        oaknut::CodeGenerator rc{target};
        if (mode == PatchMode::PreText) {
            rc.B(rel.patch_offset - patch_size - rel.module_offset);
        } else {
            rc.B(image_size - rel.module_offset + rel.patch_offset);
        }
    };

    // Writes a B from a patch-section instruction back into the module.
    const auto ApplyBranchToModuleRelocation = [&](u32* target, const Relocation& rel) {
        oaknut::CodeGenerator rc{target};
        if (mode == PatchMode::PreText) {
            rc.B(patch_size - rel.patch_offset + rel.module_offset);
        } else {
            rc.B(rel.module_offset - image_size - rel.patch_offset);
        }
    };

    // Converts a patch-section offset to its final virtual address.
    const auto RebasePatch = [&](ptrdiff_t patch_offset) {
        if (mode == PatchMode::PreText) {
            return GetInteger(load_base) + patch_offset;
        } else {
            return GetInteger(load_base) + image_size + patch_offset;
        }
    };

    // Converts a module text offset to its final virtual address.
    const auto RebasePc = [&](uintptr_t module_offset) {
        if (mode == PatchMode::PreText) {
            return GetInteger(load_base) + patch_size + module_offset;
        } else {
            return GetInteger(load_base) + module_offset;
        }
    };

    // We are now ready to relocate!
    for (const Relocation& rel : m_branch_to_patch_relocations) {
        ApplyBranchToPatchRelocation(text_words.data() + rel.module_offset / sizeof(u32), rel);
    }
    for (const Relocation& rel : m_branch_to_module_relocations) {
        ApplyBranchToModuleRelocation(m_patch_instructions.data() + rel.patch_offset / sizeof(u32),
                                      rel);
    }

    // Rewrite PC constants and record post trampolines
    for (const Relocation& rel : m_write_module_pc_relocations) {
        oaknut::CodeGenerator rc{m_patch_instructions.data() + rel.patch_offset / sizeof(u32)};
        rc.dx(RebasePc(rel.module_offset));
    }
    for (const Trampoline& rel : m_trampolines) {
        out_trampolines->insert({RebasePc(rel.module_offset), RebasePatch(rel.patch_offset)});
    }

    // Cortex-A57 seems to treat all exclusives as ordered, but newer processors do not.
    // Convert to ordered to preserve this assumption.
    for (const ModuleTextAddress i : m_exclusives) {
        auto exclusive = Exclusive{text_words[i]};
        text_words[i] = exclusive.AsOrdered();
    }

    // Copy to program image
    if (this->mode == PatchMode::PreText) {
        std::memcpy(program_image.data(), m_patch_instructions.data(),
                    m_patch_instructions.size() * sizeof(u32));
    } else {
        program_image.resize(image_size + patch_size);
        std::memcpy(program_image.data() + image_size, m_patch_instructions.data(),
                    m_patch_instructions.size() * sizeof(u32));
    }
}

// Size in bytes of the generated patch section, rounded up to page granularity.
size_t Patcher::GetSectionSize() const noexcept {
    return Common::AlignUp(m_patch_instructions.size() * sizeof(u32), Core::Memory::YUZU_PAGESIZE);
}

// Emits a helper that restores the full guest register state (system regs,
// vector regs, X0-X28) from the GuestContext reachable via TPIDR_EL0.
// Called with BL; expects 16 bytes of stack already allocated by the caller.
void Patcher::WriteLoadContext() {
    // This function was called, which modifies X30, so use that as a scratch register.
    // SP contains the guest X30, so save our return X30 to SP + 8, since we have allocated 16 bytes
    // of stack.
    c.STR(X30, SP, 8);
    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));

    // Load system registers.
    c.LDR(W0, X30, offsetof(GuestContext, fpsr));
    c.MSR(oaknut::SystemReg::FPSR, X0);
    c.LDR(W0, X30, offsetof(GuestContext, fpcr));
    c.MSR(oaknut::SystemReg::FPCR, X0);
    c.LDR(W0, X30, offsetof(GuestContext, nzcv));
    c.MSR(oaknut::SystemReg::NZCV, X0);

    // Load all vector registers.
    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
    for (int i = 0; i <= 30; i += 2) {
        c.LDP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
    }

    // Load all general-purpose registers except X30.
    // Note: loading X0 last, as X30 still points at the GuestContext here.
    for (int i = 0; i <= 28; i += 2) {
        c.LDP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
    }

    // Reload our return X30 from the stack and return.
    // The patch code will reload the guest X30 for us.
    c.LDR(X30, SP, 8);
    c.RET();
}

// Emits a helper that saves the full guest register state (X0-X28, vector
// regs, system regs, guest X30 and SP) into the GuestContext via TPIDR_EL0.
// Called with BL; expects the guest X30 at SP and 16 bytes already allocated.
void Patcher::WriteSaveContext() {
    // This function was called, which modifies X30, so use that as a scratch register.
    // SP contains the guest X30, so save our X30 to SP + 8, since we have allocated 16 bytes of
    // stack.
    c.STR(X30, SP, 8);
    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));

    // Store all general-purpose registers except X30.
    for (int i = 0; i <= 28; i += 2) {
        c.STP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
    }

    // Store all vector registers.
    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
    for (int i = 0; i <= 30; i += 2) {
        c.STP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
    }

    // Store guest system registers, X30 and SP, using X0 as a scratch register.
    // The guest X30 sits at (new) SP + 16; the guest SP was (new) SP + 32 before
    // the two 16-byte frames were pushed.
    c.STR(X0, SP, PRE_INDEXED, -16);
    c.LDR(X0, SP, 16);
    c.STR(X0, X30, 8 * 30);
    c.ADD(X0, SP, 32);
    c.STR(X0, X30, offsetof(GuestContext, sp));
    c.MRS(X0, oaknut::SystemReg::FPSR);
    c.STR(W0, X30, offsetof(GuestContext, fpsr));
    c.MRS(X0, oaknut::SystemReg::FPCR);
    c.STR(W0, X30, offsetof(GuestContext, fpcr));
    c.MRS(X0, oaknut::SystemReg::NZCV);
    c.STR(W0, X30, offsetof(GuestContext, nzcv));
    c.LDR(X0, SP, POST_INDEXED, 16);

    // Reload our return X30 from the stack, and return.
    c.LDR(X30, SP, 8);
    c.RET();
}

// Emits the patch code replacing one SVC instruction: saves the guest context,
// records the SVC number and return PC, hands control back to the host thread,
// and provides a re-entry trampoline that restores the guest afterwards.
void Patcher::WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id) {
    // We are about to start saving state, so we need to lock the context.
    this->LockContext();

    // Store guest X30 to the stack. Then, save the context and restore the stack.
    // This will save all registers except PC, but we know PC at patch time.
    c.STR(X30, SP, PRE_INDEXED, -16);
    c.BL(m_save_context);
    c.LDR(X30, SP, POST_INDEXED, 16);

    // Now that we've saved all registers, we can use any registers as scratch.
    // Store PC + 4 to arm interface, since we know the instruction offset from the entry point.
    oaknut::Label pc_after_svc;
    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
    c.LDR(X2, pc_after_svc);
    c.STR(X2, X1, offsetof(GuestContext, pc));

    // Store SVC number to execute when we return
    c.MOV(X2, svc_id);
    c.STR(W2, X1, offsetof(GuestContext, svc_swi));

    // We are calling a SVC. Clear esr_el1 and return it.
    // Atomic exchange via LDAXR/STLXR so a concurrent writer is not lost.
    static_assert(std::is_same_v<std::underlying_type_t<HaltReason>, u64>);
    oaknut::Label retry;
    c.ADD(X2, X1, offsetof(GuestContext, esr_el1));
    c.l(retry);
    c.LDAXR(X0, X2);
    c.STLXR(W3, XZR, X2);
    c.CBNZ(W3, retry);

    // Add "calling SVC" flag. Since this is X0, this is now our return value.
    c.ORR(X0, X0, static_cast<u64>(HaltReason::SupervisorCall));

    // Offset the GuestContext pointer to the HostContext member.
    // STP has limited range of [-512, 504] which we can't reach otherwise
    // NB: Due to this all offsets below are from the start of HostContext.
    c.ADD(X1, X1, offsetof(GuestContext, host_ctx));

    // Reload host TPIDR_EL0 and SP.
    static_assert(offsetof(HostContext, host_sp) + 8 == offsetof(HostContext, host_tpidr_el0));
    c.LDP(X2, X3, X1, offsetof(HostContext, host_sp));
    c.MOV(SP, X2);
    c.MSR(oaknut::SystemReg::TPIDR_EL0, X3);

    // Load callee-saved host registers and return to host.
    static constexpr size_t HOST_REGS_OFF = offsetof(HostContext, host_saved_regs);
    static constexpr size_t HOST_VREGS_OFF = offsetof(HostContext, host_saved_vregs);
    c.LDP(X19, X20, X1, HOST_REGS_OFF);
    c.LDP(X21, X22, X1, HOST_REGS_OFF + 2 * sizeof(u64));
    c.LDP(X23, X24, X1, HOST_REGS_OFF + 4 * sizeof(u64));
    c.LDP(X25, X26, X1, HOST_REGS_OFF + 6 * sizeof(u64));
    c.LDP(X27, X28, X1, HOST_REGS_OFF + 8 * sizeof(u64));
    c.LDP(X29, X30, X1, HOST_REGS_OFF + 10 * sizeof(u64));
    c.LDP(Q8, Q9, X1, HOST_VREGS_OFF);
    c.LDP(Q10, Q11, X1, HOST_VREGS_OFF + 2 * sizeof(u128));
    c.LDP(Q12, Q13, X1, HOST_VREGS_OFF + 4 * sizeof(u128));
    c.LDP(Q14, Q15, X1, HOST_VREGS_OFF + 6 * sizeof(u128));
    c.RET();

    // Write the post-SVC trampoline address, which will jump back to the guest after restoring its
    // state.
    m_trampolines.push_back({c.offset(), module_dest});

    // Host called this location. Save the return address so we can
    // unwind the stack properly when jumping back.
    c.MRS(X2, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X2, X2, offsetof(NativeExecutionParameters, native_context));
    c.ADD(X0, X2, offsetof(GuestContext, host_ctx));
    c.STR(X30, X0, offsetof(HostContext, host_saved_regs) + 11 * sizeof(u64));

    // Reload all guest registers except X30 and PC.
    // The function also expects 16 bytes of stack already allocated.
    c.STR(X30, SP, PRE_INDEXED, -16);
    c.BL(m_load_context);
    c.LDR(X30, SP, POST_INDEXED, 16);

    // Use X1 as a scratch register to restore X30.
    c.STR(X1, SP, PRE_INDEXED, -16);
    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
    c.LDR(X30, X1, offsetof(GuestContext, cpu_registers) + sizeof(u64) * 30);
    c.LDR(X1, SP, POST_INDEXED, 16);

    // Unlock the context.
    this->UnlockContext();

    // Jump back to the instruction after the emulated SVC.
    this->BranchToModule(module_dest);

    // Store PC after call.
    c.l(pc_after_svc);
    this->WriteModulePc(module_dest);
}

// Emits patch code for MRS Xn, TPIDR(RO)_EL0: reads the emulated TLS value
// from NativeExecutionParameters instead of the host system register.
void Patcher::WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
                              oaknut::SystemReg src_reg) {
    // Retrieve emulated TLS register from GuestContext.
    c.MRS(dest_reg, oaknut::SystemReg::TPIDR_EL0);
    if (src_reg == oaknut::SystemReg::TPIDRRO_EL0) {
        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidrro_el0));
    } else {
        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidr_el0));
    }

    // Jump back to the instruction after the emulated MRS.
    this->BranchToModule(module_dest);
}

// Emits patch code for MSR TPIDR_EL0, Xn: writes the guest value into the
// emulated TLS slot, preserving whichever of X0/X1 is used as scratch.
void Patcher::WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg) {
    const auto scratch_reg = src_reg.index() == 0 ? X1 : X0;
    c.STR(scratch_reg, SP, PRE_INDEXED, -16);

    // Save guest value to NativeExecutionParameters::tpidr_el0.
    c.MRS(scratch_reg, oaknut::SystemReg::TPIDR_EL0);
    c.STR(src_reg, scratch_reg, offsetof(NativeExecutionParameters, tpidr_el0));

    // Restore scratch register.
    c.LDR(scratch_reg, SP, POST_INDEXED, 16);

    // Jump back to the instruction after the emulated MSR.
    this->BranchToModule(module_dest);
}

// Emits patch code for MRS Xn, CNTPCT_EL0: reads the host virtual counter and
// rescales it to the guest counter frequency with a 64x128-bit fixed-point
// multiply (UMULH of the low factor word plus MADD of the high factor word).
void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg) {
    static Common::Arm64::NativeClock clock{};
    const auto factor = clock.GetGuestCNTFRQFactor();
    const auto raw_factor = Common::BitCast<std::array<u64, 2>>(factor);

    // Pick scratch registers that cannot collide with the destination.
    const auto use_x2_x3 = dest_reg.index() == 0 || dest_reg.index() == 1;
    oaknut::XReg scratch0 = use_x2_x3 ? X2 : X0;
    oaknut::XReg scratch1 = use_x2_x3 ? X3 : X1;

    oaknut::Label factorlo;
    oaknut::Label factorhi;

    // Save scratches.
    c.STP(scratch0, scratch1, SP, PRE_INDEXED, -16);

    // Load counter value.
    c.MRS(dest_reg, oaknut::SystemReg::CNTVCT_EL0);

    // Load scaling factor.
    c.LDR(scratch0, factorlo);
    c.LDR(scratch1, factorhi);

    // Multiply low bits and get result.
    c.UMULH(scratch0, dest_reg, scratch0);

    // Multiply high bits and add low bit result.
    c.MADD(dest_reg, dest_reg, scratch1, scratch0);

    // Reload scratches.
    c.LDP(scratch0, scratch1, SP, POST_INDEXED, 16);

    // Jump back to the instruction after the emulated MRS.
    this->BranchToModule(module_dest);

    // Scaling factor constant values.
    c.l(factorlo);
    c.dx(raw_factor[0]);
    c.l(factorhi);
    c.dx(raw_factor[1]);
}

// Emits an inline spinlock acquire of NativeExecutionParameters::lock
// (LDAXR/STXR loop storing SpinLockLocked == 0).
void Patcher::LockContext() {
    oaknut::Label retry;

    // Save scratches.
    c.STP(X0, X1, SP, PRE_INDEXED, -16);

    // Reload lock pointer.
    c.l(retry);
    c.CLREX();
    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));

    static_assert(SpinLockLocked == 0);

    // Load-linked with acquire ordering.
    c.LDAXR(W1, X0);

    // If the value was SpinLockLocked, clear monitor and retry.
    c.CBZ(W1, retry);

    // Store-conditional SpinLockLocked with relaxed ordering.
    c.STXR(W1, WZR, X0);

    // If we failed to store, retry.
    c.CBNZ(W1, retry);

    // We succeeded! Reload scratches.
    c.LDP(X0, X1, SP, POST_INDEXED, 16);
}

// Emits an inline spinlock release: a plain store-release of SpinLockUnlocked.
void Patcher::UnlockContext() {
    // Save scratches.
    c.STP(X0, X1, SP, PRE_INDEXED, -16);

    // Load lock pointer.
    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));

    // Load SpinLockUnlocked.
    c.MOV(W1, SpinLockUnlocked);

    // Store value with release ordering.
    c.STLR(W1, X0);

    // Load scratches.
    c.LDP(X0, X1, SP, POST_INDEXED, 16);
}

} // namespace Core::NCE
diff --git a/src/core/arm/nce/patcher.h b/src/core/arm/nce/patcher.h
new file mode 100644
index 000000000..c6d1608c1
--- /dev/null
+++ b/src/core/arm/nce/patcher.h
@@ -0,0 +1,98 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <span>
#include <unordered_map>
#include <vector>
#include <oaknut/code_block.hpp>
#include <oaknut/oaknut.hpp>

#include "common/common_types.h"
#include "core/hle/kernel/code_set.h"
#include "core/hle/kernel/k_typed_address.h"
#include "core/hle/kernel/physical_memory.h"

namespace Core::NCE {

// Placement of the generated patch section relative to the module image.
enum class PatchMode : u32 {
    None,
    PreText,  ///< Patch section is inserted before .text
    PostData, ///< Patch section is inserted after .data
};

using ModuleTextAddress = u64;
using PatchTextAddress = u64;
using EntryTrampolines = std::unordered_map<ModuleTextAddress, PatchTextAddress>;

// Rewrites privileged/unsupported instructions in a guest module's .text into
// branches to generated patch code, so the module can execute natively.
// Usage: PatchText() first, then RelocateAndCopy() once the load base is known.
class Patcher {
public:
    explicit Patcher();
    ~Patcher();

    void PatchText(const Kernel::PhysicalMemory& program_image,
                   const Kernel::CodeSet::Segment& code);
    void RelocateAndCopy(Common::ProcessAddress load_base, const Kernel::CodeSet::Segment& code,
                         Kernel::PhysicalMemory& program_image, EntryTrampolines* out_trampolines);
    size_t GetSectionSize() const noexcept;

    [[nodiscard]] PatchMode GetPatchMode() const noexcept {
        return mode;
    }

private:
    using ModuleDestLabel = uintptr_t;

    // Maps a patch-section re-entry point to the module PC it resumes at.
    struct Trampoline {
        ptrdiff_t patch_offset;
        uintptr_t module_offset;
    };

    void WriteLoadContext();
    void WriteSaveContext();
    void LockContext();
    void UnlockContext();
    void WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id);
    void WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
                         oaknut::SystemReg src_reg);
    void WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg);
    void WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg);

private:
    // Records a pending branch from the module into the patch section.
    void BranchToPatch(uintptr_t module_dest) {
        m_branch_to_patch_relocations.push_back({c.offset(), module_dest});
    }

    // Records a pending branch from the patch section back into the module,
    // emitting a placeholder word to be rewritten during relocation.
    void BranchToModule(uintptr_t module_dest) {
        m_branch_to_module_relocations.push_back({c.offset(), module_dest});
        c.dw(0);
    }

    // Emits a placeholder 64-bit PC constant, rewritten during relocation.
    void WriteModulePc(uintptr_t module_dest) {
        m_write_module_pc_relocations.push_back({c.offset(), module_dest});
        c.dx(0);
    }

private:
    // List of patch instructions we have generated.
    std::vector<u32> m_patch_instructions{};

    // Relocation type for relative branch from module to patch.
    struct Relocation {
        ptrdiff_t patch_offset;  ///< Offset in bytes from the start of the patch section.
        uintptr_t module_offset; ///< Offset in bytes from the start of the text section.
    };

    oaknut::VectorCodeGenerator c;
    std::vector<Trampoline> m_trampolines;
    std::vector<Relocation> m_branch_to_patch_relocations{};
    std::vector<Relocation> m_branch_to_module_relocations{};
    std::vector<Relocation> m_write_module_pc_relocations{};
    std::vector<ModuleTextAddress> m_exclusives{};
    oaknut::Label m_save_context{};
    oaknut::Label m_load_context{};
    PatchMode mode{PatchMode::None};
};

} // namespace Core::NCE