You're right. The problem isn't interrupts in get_s_time_fixed, but rather a cumulative error in the sequence caused by integer rounding. I added logging of the current local_stime to local_time_calibration and noticed that the gap between the cores' timestamps grows steadily. If the server has been running for weeks, this can become a very large value.
```
@@ -1732,6 +1753,8 @@ static void cf_check local_time_calibration(void)
     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
     {
         /* Atomically read cpu_calibration struct and write cpu_time struct. */
+        printk("update stime on time calibrate %d, %lu -> %lu (%lu, %lu)\n",
+               smp_processor_id(), t->stamp.local_stime, c->local_stime, t->last_seen_ns, t->last_seen_tsc);
         local_irq_disable();
         t->stamp = *c;
         local_irq_enable();
```
After 2 hours of uptime:
```
(XEN) update stime on time calibrate 0, 8564145820102 -> 8565145861597 (8565145862216, 0)
(XEN) update stime on time calibrate 1, 8564145820129 -> 8565145861609 (8565145863957, 0)
(XEN) update stime on time calibrate 3, 8564145819996 -> 8565145861491 (8565145864800, 0)
(XEN) update stime on time calibrate 2, 8564145820099 -> 8565145861609 (8565145865372, 0)

8565145861609 - 8565145861491 = 118 ns * 3 (3.00 GHz) = 354 TSC ticks of lag
```
After 6 hours of uptime:
```
(XEN) update stime on time calibrate 0, 22914730829200 -> 22915730869993 (22915730870665, 0)
(XEN) update stime on time calibrate 1, 22914730829073 -> 22915730869889 (22915730870693, 0)
(XEN) update stime on time calibrate 2, 22914730829052 -> 22915730869841 (22915730872231, 0)
(XEN) update stime on time calibrate 3, 22914730828892 -> 22915730869696 (22915730872096, 0)

22915730869993 - 22915730869696 = 297 ns * 3 (3.00 GHz) = 891 TSC ticks of lag
```
For clarification, my Xen boot settings:
```
ucode=scan
dom0_mem=53923M,max:53923M dom0_max_vcpus=4-96 dom0_vcpus_pin=0
force-ept=1 ept=no-ad,no-pml hap_1gb=0 hap_2mb=0 altp2m=1
hpet=legacy-replacement smt=1 spec-ctrl=no gnttab_max_frames=512
cpufreq=xen:performance max_cstate=1 sched=credit sched-gran=cpu
apicv=0 sched_credit_tslice_ms=5 sched_ratelimit_us=500
```
Processors I tested on:
```
Intel(R) Core(TM) i5-3330 CPU @ 3.00GHz
Flags: fpu de tsc msr pae mce cx8 apic sep mca cmov pat clflush acpi mmx
fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc rep_good nopl
nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor est ssse3 cx16
sse4_1 sse4_2 popcnt aes xsave avx f16c hypervisor lahf_lm
cpuid_fault fsgsbase erms xsaveopt arch_capabilities
```
```
Intel(R) Xeon(R) Gold 5318Y CPU @ 2.10GHz x2 (NUMA)
Flags: fpu de tsc msr pae mce cx8 apic sep mca cmov pat clflush acpi mmx
fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc rep_good nopl
nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor est ssse3 fma
cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor
lahf_lm abm 3dnowprefetch cpuid_fault ibpb fsgsbase bmi1 avx2 bmi2
erms rtm avx512f avx512dq rdseed adx avx512ifma clflushopt clwb
avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 avx512vbmi
avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg
avx512_vpopcntdq rdpid fsrm md_clear arch_capabilities
```
Next I moved the code to ring 3 (userspace) to make the problem faster to reproduce:
```
/* Helpers taken from Xen's x86 time code, plus a userspace driver loop. */
#include <stdint.h>
#include <stdio.h>

static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}

#define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))

struct cpu_time_stamp {
    uint64_t local_tsc;
    int64_t  local_stime;
    int64_t  master_stime;
};

struct time_scale {
    int shift;
    uint32_t mul_frac;
};

static inline uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
    uint32_t quotient, remainder;
    asm ( "divl %4"
          : "=a" (quotient), "=d" (remainder)
          : "0" (0), "1" (dividend), "r" (divisor) );
    return quotient;
}

void set_time_scale(struct time_scale *ts, uint64_t ticks_per_sec)
{
    uint64_t tps64 = ticks_per_sec;
    uint32_t tps32;
    int shift = 0;

    while ( tps64 > (MILLISECS(1000)*2) )
    {
        tps64 >>= 1;
        shift--;
    }

    tps32 = (uint32_t)tps64;
    while ( tps32 <= (uint32_t)MILLISECS(1000) )
    {
        tps32 <<= 1;
        shift++;
    }

    ts->mul_frac = div_frac(MILLISECS(1000), tps32);
    ts->shift    = shift;
}

uint64_t scale_delta(uint64_t delta, const struct time_scale *scale)
{
    uint64_t product;

    if ( scale->shift < 0 )
        delta >>= -scale->shift;
    else
        delta <<= scale->shift;

    asm ( "mulq %2 ; shrd $32,%1,%0"
          : "=a" (product), "=d" (delta)
          : "rm" (delta), "0" ((uint64_t)scale->mul_frac) );

    return product;
}

#define _TS_MUL_FRAC_IDENTITY 0x80000000UL

static inline struct time_scale scale_reciprocal(struct time_scale scale)
{
    struct time_scale reciprocal;
    uint32_t dividend;

    dividend = _TS_MUL_FRAC_IDENTITY;
    reciprocal.shift = 1 - scale.shift;
    while ( dividend >= scale.mul_frac )
    {
        dividend >>= 1;
        reciprocal.shift++;
    }

    asm ( "divl %4"
          : "=a" (reciprocal.mul_frac), "=d" (dividend)
          : "0" (0), "1" (dividend), "r" (scale.mul_frac) );

    return reciprocal;
}

int64_t get_s_time_fixed(const struct cpu_time_stamp *t,
                         const struct time_scale *ts, uint64_t at_tsc)
{
    uint64_t tsc, delta;

    if ( at_tsc )
        tsc = at_tsc;
    else
        tsc = rdtsc();
    delta = tsc - t->local_tsc;
    return t->local_stime + scale_delta(delta, ts);
}

int main(void)
{
    struct cpu_time_stamp ct;
    struct time_scale ts;
    struct time_scale back;
    uint64_t start = rdtsc();

    set_time_scale(&ts, 3000000000ULL);   /* 3.00 GHz TSC */
    back = scale_reciprocal(ts);

    ct.local_tsc   = start;
    ct.local_stime = scale_delta(start, &ts);

    while (1) {
        /* "Calibrate": fold the elapsed delta into the stamp, as
         * local_time_calibration effectively does each period. */
        uint64_t new_tsc = rdtsc();
        ct.local_stime = get_s_time_fixed(&ct, &ts, new_tsc);
        ct.local_tsc   = new_tsc;

        /* Print the raw TSC next to the TSC reconstructed from stime;
         * the two drift apart as the rounding error accumulates. */
        uint64_t tmp_tsc = rdtsc();
        printf("%lu %lu\n", tmp_tsc,
               scale_delta(get_s_time_fixed(&ct, &ts, tmp_tsc), &back));
    }
    return 0;
}
```
It's enough to run the loop for 10-20 seconds to see the problem. scale_delta rounds off the least significant bits during the calculation (the fractional nanoseconds are discarded, and with a negative shift some low TSC bits are dropped as well), so every calibration step loses up to about a nanosecond. The error therefore accumulates, at different rates on different cores, depending on the least significant bits at the time of calibration.
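To make the rounding concrete, here is a minimal standalone sketch (not Xen code; GCC/Clang on x86-64 assumed for __int128) that scales the same TSC interval in two ways: once as a single large delta, and once folded step by step the way the calibration loop does, using the same 32.32 fixed-point multiply as scale_delta. For a 3.00 GHz TSC, set_time_scale above yields shift = -1 and mul_frac = 0xAAAAAAAA (about 2/3 in 0.32 fixed point); with those values the two results drift apart by roughly a nanosecond per step.
```
#include <stdint.h>
#include <stdio.h>

/* Same fixed-point conversion as scale_delta(): for shift = -1,
 * ns = ((delta >> 1) * mul_frac) >> 32. The final >> 32 truncates
 * the fractional nanoseconds. */
static uint64_t scale(uint64_t delta, int shift, uint32_t mul_frac)
{
    if (shift < 0)
        delta >>= -shift;
    else
        delta <<= shift;
    return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
    const int shift = -1;                  /* what set_time_scale() gives for 3.00 GHz */
    const uint32_t mul_frac = 0xAAAAAAAA;  /* ~2/3 of 2^32 */
    const uint64_t step = 3000001;         /* odd TSC delta per "calibration" (~1 ms) */
    const int steps = 1000;

    uint64_t folded = 0;                   /* stime rebuilt one calibration at a time */
    for (int i = 0; i < steps; i++)
        folded += scale(step, shift, mul_frac);

    uint64_t once = scale(step * (uint64_t)steps, shift, mul_frac);

    /* Each fold truncates the sub-nanosecond remainder, so 'folded'
     * falls behind 'once' by roughly a nanosecond per step. */
    printf("one shot: %lu ns, folded: %lu ns, lost: %lu ns\n",
           once, folded, once - folded);
    return 0;
}
```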
Now let's talk about why dwm reacts this way. When a snapshot is reverted, last_guest_time in hvm_get_guest_time_fixed is set to 0, which does not prevent time from flowing backwards. This means that cache_tsc_offset in hvm_get_guest_tsc_fixed may be set up correctly on one physical core, but after the vCPU is scheduled onto a physical core with a lagging TSC, the guest may occasionally see a TSC value lower than the one it saw before the snapshot was reverted. If this happens in the dwm code, it terminates with an error.
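To illustrate the guest-visible symptom, here is a toy model with made-up numbers (this is not the actual hvm_get_guest_time_fixed logic, only the arithmetic of an offset captured on one pCPU and applied on another): the offset is computed against the restoring pCPU's clock, so a read taken right after the vCPU migrates to a pCPU whose clock lags by a few hundred nanoseconds comes out lower than a value the guest has already seen.
```
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Made-up values; the ~300 ns spread matches the per-core lag in the logs above. */
    int64_t guest_time_at_restore = 5000000000LL;  /* ns, restored from the snapshot */
    int64_t clock_pcpu_a = 9000000000LL;           /* pCPU that handles the restore */
    int64_t clock_pcpu_b = clock_pcpu_a - 300;     /* lagging pCPU */

    /* Offset computed once, against pCPU A's clock. */
    int64_t offset = guest_time_at_restore - clock_pcpu_a;

    /* Guest reads its clock on pCPU A, then is scheduled onto pCPU B
     * before B's clock has advanced past the 300 ns gap. */
    int64_t read_on_a = clock_pcpu_a + offset;     /* 5000000000 */
    int64_t read_on_b = clock_pcpu_b + offset;     /* 4999999700: time went backwards */

    printf("on A: %ld ns, then on B: %ld ns (delta %ld ns)\n",
           read_on_a, read_on_b, read_on_b - read_on_a);
    return 0;
}
```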
A quick fix for this problem might be to save a last_seen_tsc value for each core in the snapshot and validate against it afterwards.
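A minimal sketch of that idea, with hypothetical names (struct vcpu_time_state and guest_tsc_monotonic are not existing Xen symbols): remember the highest value ever handed to the guest, persist it in the snapshot, and clamp every later read against it.
```
#include <stdint.h>

/* Hypothetical per-vCPU state that would be saved into and restored from the snapshot. */
struct vcpu_time_state {
    uint64_t last_seen_tsc;   /* highest TSC value ever returned to this vCPU */
};

/* Clamp a freshly computed guest TSC so the guest never observes it going
 * backwards, even if the per-pCPU clock it was derived from lags behind. */
static uint64_t guest_tsc_monotonic(struct vcpu_time_state *v, uint64_t raw_tsc)
{
    if (raw_tsc < v->last_seen_tsc)
        return v->last_seen_tsc;    /* validation failed: refuse to go backwards */
    v->last_seen_tsc = raw_tsc;
    return raw_tsc;
}
```
In real code the update would presumably need the same kind of locking that hvm_get_guest_time_fixed already uses when it bumps last_guest_time.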
The correct solution
is to remove the rounding of the least significant bits from the
sequence.
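A sketch of what that could look like (hypothetical fields, not a patch): carry the fractional nanoseconds that scale_delta currently discards in the per-CPU stamp and feed them back into the next calibration, so each step truncates against the full-precision running value instead of the previously truncated one.
```
#include <stdint.h>

/* Hypothetical extension of the per-CPU stamp: besides the whole nanoseconds,
 * keep the fractional part (0.32 fixed point) that would otherwise be lost. */
struct cpu_time_stamp_ext {
    uint64_t local_tsc;
    int64_t  local_stime;     /* whole nanoseconds */
    uint32_t stime_frac;      /* fractional nanoseconds, in 2^-32 ns units */
};

/* Full-precision conversion: a 64.32 fixed-point nanosecond value. The shift
 * handling of scale_delta() (and the ticks a negative shift drops) is omitted
 * for brevity and would need the same carry treatment. */
static unsigned __int128 scale_delta_frac(uint64_t delta, uint32_t mul_frac)
{
    return (unsigned __int128)delta * mul_frac;
}

/* Fold the elapsed TSC delta into the stamp without losing the fraction. */
static void calibrate(struct cpu_time_stamp_ext *t, uint64_t now_tsc,
                      uint32_t mul_frac)
{
    unsigned __int128 ns_frac =
        scale_delta_frac(now_tsc - t->local_tsc, mul_frac) + t->stime_frac;

    t->local_stime += (uint64_t)(ns_frac >> 32);   /* whole ns advance */
    t->stime_frac   = (uint32_t)ns_frac;           /* carry the remainder */
    t->local_tsc    = now_tsc;
}
```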