diff options
Diffstat (limited to 'src/common/x64')
-rw-r--r-- | src/common/x64/native_clock.cpp | 118 | ||||
-rw-r--r-- | src/common/x64/native_clock.h | 28 | ||||
-rw-r--r-- | src/common/x64/xbyak_abi.h | 44 |
3 files changed, 138 insertions, 52 deletions
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 424b39b1f..a65f6b832 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -2,19 +2,74 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <array> #include <chrono> +#include <limits> #include <mutex> #include <thread> #ifdef _MSC_VER #include <intrin.h> + +#pragma intrinsic(__umulh) +#pragma intrinsic(_udiv128) #else #include <x86intrin.h> #endif +#include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" +namespace { + +[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { +#ifdef __SIZEOF_INT128__ + const auto base = static_cast<unsigned __int128>(numerator) << 64ULL; + return static_cast<u64>(base / divisor); +#elif defined(_M_X64) || defined(_M_ARM64) + std::array<u64, 2> r = {0, numerator}; + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], divisor, &remainder); +#else + return _udiv128(r[1], r[0], divisor, &remainder); +#endif +#else + // This one is bit more inaccurate. + return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor); +#endif +} + +[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { +#ifdef __SIZEOF_INT128__ + return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64; +#elif defined(_M_X64) || defined(_M_ARM64) + return __umulh(a, b); // MSVC +#else + // Generic fallback + const u64 a_lo = u32(a); + const u64 a_hi = a >> 32; + const u64 b_lo = u32(b); + const u64 b_hi = b >> 32; + + const u64 a_x_b_hi = a_hi * b_hi; + const u64 a_x_b_mid = a_hi * b_lo; + const u64 b_x_a_mid = b_hi * a_lo; + const u64 a_x_b_lo = a_lo * b_lo; + + const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) + + static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >> + 32; + + const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; + + return multhi; +#endif +} + +} // namespace + namespace Common { u64 EstimateRDTSCFrequency() { @@ -43,59 +98,76 @@ u64 EstimateRDTSCFrequency() { } namespace X64 { -NativeClock::NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, - u64 rtsc_frequency) - : WallClock(emulated_cpu_frequency, emulated_clock_frequency, true), rtsc_frequency{ - rtsc_frequency} { +NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_, + u64 rtsc_frequency_) + : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ + rtsc_frequency_} { _mm_mfence(); - last_measure = __rdtsc(); - accumulated_ticks = 0U; + time_point.inner.last_measure = __rdtsc(); + time_point.inner.accumulated_ticks = 0U; + ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); + us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); + ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); + clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency); + cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency); } u64 NativeClock::GetRTSC() { - std::scoped_lock scope{rtsc_serialize}; - _mm_mfence(); - const u64 current_measure = __rdtsc(); - u64 diff = current_measure - last_measure; - diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0) - if (current_measure > last_measure) { - last_measure = current_measure; - } - accumulated_ticks += diff; + TimePoint new_time_point{}; + TimePoint current_time_point{}; + do { + current_time_point.pack = time_point.pack; + _mm_mfence(); + const u64 current_measure = __rdtsc(); + u64 diff = current_measure - current_time_point.inner.last_measure; + diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0) + new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure + ? current_measure + : current_time_point.inner.last_measure; + new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return accumulated_ticks & inaccuracy_mask; + return new_time_point.inner.accumulated_ticks & inaccuracy_mask; } void NativeClock::Pause(bool is_paused) { if (!is_paused) { - _mm_mfence(); - last_measure = __rdtsc(); + TimePoint current_time_point{}; + TimePoint new_time_point{}; + do { + current_time_point.pack = time_point.pack; + new_time_point.pack = current_time_point.pack; + _mm_mfence(); + new_time_point.inner.last_measure = __rdtsc(); + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); } } std::chrono::nanoseconds NativeClock::GetTimeNS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; + return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; } std::chrono::microseconds NativeClock::GetTimeUS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; + return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; } std::chrono::milliseconds NativeClock::GetTimeMS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; + return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; } u64 NativeClock::GetClockCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, clock_rtsc_factor); } u64 NativeClock::GetCPUCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, cpu_rtsc_factor); } } // namespace X64 diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 891a3bbfd..7cbd400d2 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -6,15 +6,15 @@ #include <optional> -#include "common/spin_lock.h" #include "common/wall_clock.h" namespace Common { namespace X64 { -class NativeClock : public WallClock { +class NativeClock final : public WallClock { public: - NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, u64 rtsc_frequency); + explicit NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_, + u64 rtsc_frequency_); std::chrono::nanoseconds GetTimeNS() override; @@ -31,14 +31,28 @@ public: private: u64 GetRTSC(); + union alignas(16) TimePoint { + TimePoint() : pack{} {} + u128 pack{}; + struct Inner { + u64 last_measure{}; + u64 accumulated_ticks{}; + } inner; + }; + /// value used to reduce the native clocks accuracy as some apss rely on /// undefined behavior where the level of accuracy in the clock shouldn't /// be higher. - static constexpr u64 inaccuracy_mask = ~(0x400 - 1); + static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); + + TimePoint time_point; + // factors + u64 clock_rtsc_factor{}; + u64 cpu_rtsc_factor{}; + u64 ns_rtsc_factor{}; + u64 us_rtsc_factor{}; + u64 ms_rtsc_factor{}; - SpinLock rtsc_serialize{}; - u64 last_measure{}; - u64 accumulated_ticks{}; u64 rtsc_frequency; }; } // namespace X64 diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h index a5f5d4fc1..c2c9b6134 100644 --- a/src/common/x64/xbyak_abi.h +++ b/src/common/x64/xbyak_abi.h @@ -11,25 +11,25 @@ namespace Common::X64 { -inline std::size_t RegToIndex(const Xbyak::Reg& reg) { +constexpr size_t RegToIndex(const Xbyak::Reg& reg) { using Kind = Xbyak::Reg::Kind; ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0, "RegSet only support GPRs and XMM registers."); ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15."); - return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16); + return static_cast<size_t>(reg.getIdx()) + (reg.getKind() == Kind::REG ? 0 : 16); } -inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) { +constexpr Xbyak::Reg64 IndexToReg64(size_t reg_index) { ASSERT(reg_index < 16); return Xbyak::Reg64(static_cast<int>(reg_index)); } -inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) { +constexpr Xbyak::Xmm IndexToXmm(size_t reg_index) { ASSERT(reg_index >= 16 && reg_index < 32); return Xbyak::Xmm(static_cast<int>(reg_index - 16)); } -inline Xbyak::Reg IndexToReg(std::size_t reg_index) { +constexpr Xbyak::Reg IndexToReg(size_t reg_index) { if (reg_index < 16) { return IndexToReg64(reg_index); } else { @@ -45,17 +45,17 @@ inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) { return bits; } -const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF); -const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000); +constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF); +constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000); #ifdef _WIN32 // Microsoft x64 ABI -const Xbyak::Reg ABI_RETURN = Xbyak::util::rax; -const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx; -const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx; -const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8; -const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9; +constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax; +constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx; +constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx; +constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8; +constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9; const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({ // GPRs @@ -102,11 +102,11 @@ constexpr size_t ABI_SHADOW_SPACE = 0x20; #else // System V x86-64 ABI -const Xbyak::Reg ABI_RETURN = Xbyak::util::rax; -const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi; -const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi; -const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx; -const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx; +constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax; +constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi; +constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi; +constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx; +constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx; const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({ // GPRs @@ -182,7 +182,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b size_t rsp_alignment, size_t needed_frame_size = 0) { auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); - for (std::size_t i = 0; i < regs.size(); ++i) { + for (size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_GPRS[i]) { code.push(IndexToReg64(i)); } @@ -192,7 +192,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b code.sub(code.rsp, frame_info.subtraction); } - for (std::size_t i = 0; i < regs.size(); ++i) { + for (size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i)); frame_info.xmm_offset += 0x10; @@ -206,7 +206,7 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits size_t rsp_alignment, size_t needed_frame_size = 0) { auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); - for (std::size_t i = 0; i < regs.size(); ++i) { + for (size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]); frame_info.xmm_offset += 0x10; @@ -218,8 +218,8 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits } // GPRs need to be popped in reverse order - for (std::size_t j = 0; j < regs.size(); ++j) { - const std::size_t i = regs.size() - j - 1; + for (size_t j = 0; j < regs.size(); ++j) { + const size_t i = regs.size() - j - 1; if (regs[i] && ABI_ALL_GPRS[i]) { code.pop(IndexToReg64(i)); } |