From d4f871cb6a616f6b1a8a76049e042118571f3dd3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 1 Jan 2021 23:28:55 +0100 Subject: X86/NativeClock: Improve performance of clock calculations on hot path. --- src/common/x64/native_clock.cpp | 69 ++++++++++++++++++++++++++++++++++++++--- src/common/x64/native_clock.h | 7 +++++ 2 files changed, 71 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index eb8a7782f..e246432d0 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -2,12 +2,17 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include +#include #include #include #ifdef _MSC_VER #include + +#pragma intrinsic(__umulh) +#pragma intrinsic(_udiv128) #else #include #endif @@ -15,6 +20,55 @@ #include "common/uint128.h" #include "common/x64/native_clock.h" +namespace { + +[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { +#ifdef __SIZEOF_INT128__ + const auto base = static_cast(numerator) << 64ULL; + return static_cast(base / divisor); +#elif defined(_M_X64) || defined(_M_ARM64) + std::array r = {0, numerator}; + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], divisor, &remainder); +#else + return _udiv128(r[1], r[0], divisor, &remainder); +#endif +#else + // This one is bit more inaccurate. + return MultiplyAndDivide64(std::numeric_limits::max(), numerator, divisor); +#endif +} + +[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { +#ifdef __SIZEOF_INT128__ + return (static_cast(a) * static_cast(b)) >> 64; +#elif defined(_M_X64) || defined(_M_ARM64) + return __umulh(a, b); // MSVC +#else + // Generic fallback + const u64 a_lo = u32(a); + const u64 a_hi = a >> 32; + const u64 b_lo = u32(b); + const u64 b_hi = b >> 32; + + const u64 a_x_b_hi = a_hi * b_hi; + const u64 a_x_b_mid = a_hi * b_lo; + const u64 b_x_a_mid = b_hi * a_lo; + const u64 a_x_b_lo = a_lo * b_lo; + + const u64 carry_bit = (static_cast(static_cast(a_x_b_mid)) + + static_cast(static_cast(b_x_a_mid)) + (a_x_b_lo >> 32)) >> + 32; + + const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; + + return multhi; +#endif +} + +} // namespace + namespace Common { u64 EstimateRDTSCFrequency() { @@ -50,6 +104,11 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen _mm_mfence(); last_measure = __rdtsc(); accumulated_ticks = 0U; + ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); + us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); + ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); + clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency); + cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency); } u64 NativeClock::GetRTSC() { @@ -75,27 +134,27 @@ void NativeClock::Pause(bool is_paused) { std::chrono::nanoseconds NativeClock::GetTimeNS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; + return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; } std::chrono::microseconds NativeClock::GetTimeUS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; + return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; } std::chrono::milliseconds NativeClock::GetTimeMS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; + return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; } u64 NativeClock::GetClockCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, clock_rtsc_factor); } u64 NativeClock::GetCPUCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, cpu_rtsc_factor); } } // namespace X64 diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 6d1e32ac8..a7b1ee9e0 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -41,6 +41,13 @@ private: u64 last_measure{}; u64 accumulated_ticks{}; u64 rtsc_frequency; + + // factors + u64 ns_rtsc_factor{}; + u64 us_rtsc_factor{}; + u64 ms_rtsc_factor{}; + u64 clock_rtsc_factor{}; + u64 cpu_rtsc_factor{}; }; } // namespace X64 -- cgit v1.2.3