3 files changed, 138 insertions, 52 deletions
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 424b39b1f..a65f6b832 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -2,19 +2,74 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <array>
 #include <chrono>
+#include <limits>
 #include <mutex>
 #include <thread>
 
 #ifdef _MSC_VER
 #include <intrin.h>
+
+#pragma intrinsic(__umulh)
+#pragma intrinsic(_udiv128)
 #else
 #include <x86intrin.h>
 #endif
 
+#include "common/atomic_ops.h"
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"
 
+namespace {
+
+[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { // (numerator << 64) / divisor
+#ifdef __SIZEOF_INT128__
+    const auto base = static_cast<unsigned __int128>(numerator) << 64ULL;
+    return static_cast<u64>(base / divisor); // truncated to 64 bits if numerator >= divisor
+#elif defined(_M_X64) || defined(_M_ARM64)
+    std::array<u64, 2> r = {0, numerator}; // r[0] = low word, r[1] = high word of numerator << 64
+    u64 remainder; // out-parameter required by the division helpers; its value is never read
+#if _MSC_VER < 1923
+    return udiv128(r[1], r[0], divisor, &remainder);
+#else
+    return _udiv128(r[1], r[0], divisor, &remainder);
+#endif
+#else
+    // This one is a bit more inaccurate: it scales by 2^64 - 1 instead of 2^64.
+    return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor);
+#endif
+}
+
+[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { // upper 64 bits of the 128-bit product a * b
+#ifdef __SIZEOF_INT128__
+    return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64;
+#elif defined(_M_X64) || defined(_M_ARM64)
+    return __umulh(a, b); // MSVC intrinsic: high 64 bits of the unsigned 64x64 product
+#else
+    // Generic fallback: 32x32-bit partial products, keeping only the high 64 bits
+    const u64 a_lo = u32(a);
+    const u64 a_hi = a >> 32;
+    const u64 b_lo = u32(b);
+    const u64 b_hi = b >> 32;
+
+    const u64 a_x_b_hi = a_hi * b_hi;  // weight 2^64
+    const u64 a_x_b_mid = a_hi * b_lo; // weight 2^32
+    const u64 b_x_a_mid = b_hi * a_lo; // weight 2^32
+    const u64 a_x_b_lo = a_lo * b_lo;  // weight 2^0
+
+    const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) +
+                           static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >>
+                          32; // carry out of the low 64 bits of the product (0, 1 or 2)
+
+    const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit;
+
+    return multhi;
+#endif
+}
+
+} // namespace
+
 namespace Common {
 
 u64 EstimateRDTSCFrequency() {
@@ -43,59 +98,76 @@ u64 EstimateRDTSCFrequency() {
 }
 
 namespace X64 {
-NativeClock::NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency,
-                         u64 rtsc_frequency)
-    : WallClock(emulated_cpu_frequency, emulated_clock_frequency, true), rtsc_frequency{
-                                                                             rtsc_frequency} {
+NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
+                         u64 rtsc_frequency_)
+    : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
+                                                                               rtsc_frequency_} {
     _mm_mfence();
-    last_measure = __rdtsc();
-    accumulated_ticks = 0U;
+    time_point.inner.last_measure = __rdtsc();
+    time_point.inner.accumulated_ticks = 0U;
+    ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency);
+    us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency);
+    ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency);
+    clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
+    cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
 }
 
 u64 NativeClock::GetRTSC() {
-    std::scoped_lock scope{rtsc_serialize};
-    _mm_mfence();
-    const u64 current_measure = __rdtsc();
-    u64 diff = current_measure - last_measure;
-    diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
-    if (current_measure > last_measure) {
-        last_measure = current_measure;
-    }
-    accumulated_ticks += diff;
+    TimePoint new_time_point{};
+    TimePoint current_time_point{};
+    do {
+        current_time_point.pack = time_point.pack;
+        _mm_mfence();
+        const u64 current_measure = __rdtsc();
+        u64 diff = current_measure - current_time_point.inner.last_measure;
+        diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
+        new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
+                                                ? current_measure
+                                                : current_time_point.inner.last_measure;
+        new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
+    } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                           current_time_point.pack));
     /// The clock cannot be more precise than the guest timer, remove the lower bits
-    return accumulated_ticks & inaccuracy_mask;
+    return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
 }
 
 void NativeClock::Pause(bool is_paused) {
     if (!is_paused) {
-        _mm_mfence();
-        last_measure = __rdtsc();
+        TimePoint current_time_point{};
+        TimePoint new_time_point{};
+        do {
+            current_time_point.pack = time_point.pack;
+            new_time_point.pack = current_time_point.pack;
+            _mm_mfence();
+            new_time_point.inner.last_measure = __rdtsc();
+        } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                               current_time_point.pack));
     }
 }
 
 std::chrono::nanoseconds NativeClock::GetTimeNS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)};
+    return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)};
 }
 
 std::chrono::microseconds NativeClock::GetTimeUS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)};
+    return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)};
 }
 
 std::chrono::milliseconds NativeClock::GetTimeMS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)};
+    return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)};
 }
 
 u64 NativeClock::GetClockCycles() {
     const u64 rtsc_value = GetRTSC();
-    return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency);
+    return MultiplyHigh(rtsc_value, clock_rtsc_factor);
 }
 
 u64 NativeClock::GetCPUCycles() {
     const u64 rtsc_value = GetRTSC();
-    return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency);
+    return MultiplyHigh(rtsc_value, cpu_rtsc_factor);
 }
 
 } // namespace X64
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
index 891a3bbfd..7cbd400d2 100644
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -6,15 +6,15 @@
 
 #include <optional>
 
-#include "common/spin_lock.h"
 #include "common/wall_clock.h"
 
 namespace Common {
 
 namespace X64 {
-class NativeClock : public WallClock {
+class NativeClock final : public WallClock {
 public:
-    NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, u64 rtsc_frequency);
+    explicit NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
+                         u64 rtsc_frequency_);
 
     std::chrono::nanoseconds GetTimeNS() override;
 
@@ -31,14 +31,28 @@ public:
 private:
     u64 GetRTSC();
 
+    union alignas(16) TimePoint { // both counters viewed as one 128-bit value for the CAS loops
+        TimePoint() : pack{} {}
+        u128 pack{}; // raw 128-bit image passed to Common::AtomicCompareAndSwap
+        struct Inner {
+            u64 last_measure{};      // TSC reading taken at the most recent update
+            u64 accumulated_ticks{}; // running sum of the non-negative TSC deltas
+        } inner;
+    };
+
     /// value used to reduce the native clocks accuracy as some apss rely on
     /// undefined behavior where the level of accuracy in the clock shouldn't
     /// be higher.
-    static constexpr u64 inaccuracy_mask = ~(0x400 - 1);
+    static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);
+
+    TimePoint time_point;
+    // Fixed-point factors, (target frequency << 64) / rtsc_frequency, set by the constructor
+    u64 clock_rtsc_factor{};
+    u64 cpu_rtsc_factor{};
+    u64 ns_rtsc_factor{};
+    u64 us_rtsc_factor{};
+    u64 ms_rtsc_factor{};
 
-    SpinLock rtsc_serialize{};
-    u64 last_measure{};
-    u64 accumulated_ticks{};
     u64 rtsc_frequency;
 };
 } // namespace X64
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
index a5f5d4fc1..c2c9b6134 100644
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -11,25 +11,25 @@
 
 namespace Common::X64 {
 
-inline std::size_t RegToIndex(const Xbyak::Reg& reg) {
+constexpr size_t RegToIndex(const Xbyak::Reg& reg) {
     using Kind = Xbyak::Reg::Kind;
     ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
                "RegSet only support GPRs and XMM registers.");
     ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
-    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
+    return static_cast<size_t>(reg.getIdx()) + (reg.getKind() == Kind::REG ? 0 : 16);
 }
 
-inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) {
+constexpr Xbyak::Reg64 IndexToReg64(size_t reg_index) {
     ASSERT(reg_index < 16);
     return Xbyak::Reg64(static_cast<int>(reg_index));
 }
 
-inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) {
+constexpr Xbyak::Xmm IndexToXmm(size_t reg_index) {
     ASSERT(reg_index >= 16 && reg_index < 32);
     return Xbyak::Xmm(static_cast<int>(reg_index - 16));
 }
 
-inline Xbyak::Reg IndexToReg(std::size_t reg_index) {
+constexpr Xbyak::Reg IndexToReg(size_t reg_index) {
     if (reg_index < 16) {
         return IndexToReg64(reg_index);
     } else {
@@ -45,17 +45,17 @@ inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
     return bits;
 }
 
-const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
-const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
 
 #ifdef _WIN32
 
 // Microsoft x64 ABI
-const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
-const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
-const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
-const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
-const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
 
 const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
     // GPRs
@@ -102,11 +102,11 @@ constexpr size_t ABI_SHADOW_SPACE = 0x20;
 #else
 
 // System V x86-64 ABI
-const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
-const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
-const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
-const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
-const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
 
 const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
     // GPRs
@@ -182,7 +182,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b
                                               size_t rsp_alignment, size_t needed_frame_size = 0) {
     auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
 
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_GPRS[i]) {
             code.push(IndexToReg64(i));
         }
@@ -192,7 +192,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b
         code.sub(code.rsp, frame_info.subtraction);
     }
 
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
             code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
             frame_info.xmm_offset += 0x10;
@@ -206,7 +206,7 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits
                                            size_t rsp_alignment, size_t needed_frame_size = 0) {
     auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
 
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
             code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
             frame_info.xmm_offset += 0x10;
@@ -218,8 +218,8 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits
     }
 
     // GPRs need to be popped in reverse order
-    for (std::size_t j = 0; j < regs.size(); ++j) {
-        const std::size_t i = regs.size() - j - 1;
+    for (size_t j = 0; j < regs.size(); ++j) {
+        const size_t i = regs.size() - j - 1;
         if (regs[i] && ABI_ALL_GPRS[i]) {
             code.pop(IndexToReg64(i));
         }