To: "Xen-Devel (E-mail)" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [RFC] [PATCH] use "reliable" tsc properly when available, but verify
From: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
Date: Mon, 28 Sep 2009 13:19:03 -0700 (PDT)
Delivery-date: Mon, 28 Sep 2009 13:19:52 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx

(This is compile-tested only.  Review is requested, especially where
marked with FIXME.  Code that does write_tsc in smpboot.c may also
need patching.  And Linux's check_tsc_warp() calls
touch_nmi_watchdog()... does Xen need to do anything similar?)

Most modern Intel and AMD processors and servers have
fully synchronized, non-stop TSCs that don't even stop
in C3 state.  Recent upstream Linux kernels test a cpuid
bit and record this capability as
X86_FEATURE_TSC_RELIABLE.  According to Intel, all
recent Intel processors AND the systems built on them
have this property.  According to AMD, many recent AMD
processors (and all recent server processors) have
this property but apparently some of the systems built
on them do not.
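
For reference, the cpuid bit in question is the "invariant TSC" bit,
EDX bit 8 of CPUID leaf 0x80000007, the same bit the init_amd/init_intel
hunks below test.  A minimal stand-alone sketch of the check (user-space,
using GCC's cpuid.h rather than any Xen helper) looks roughly like:

    #include <stdio.h>
    #include <cpuid.h>   /* GCC's __get_cpuid() */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 0x80000007, EDX bit 8: invariant (constant-rate, non-stop) TSC */
        if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx) && (edx & (1u << 8)))
            printf("invariant TSC\n");
        else
            printf("no invariant TSC\n");
        return 0;
    }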

So we trust but verify... if the cpuid bit is set
we assume a reliable tsc, but use the Linux boottime
check_tsc_warp algorithm to periodically verify that
the tsc hasn't skewed.  If it has, we fall back to Xen
managing (and periodically writing) the TSC.
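
In outline, the verification path added below reduces to the following
(condensed from time_verify_tsc_calibration_rendezvous(); the identifiers
are the ones the patch introduces):

    check_tsc_warp(cpu_khz, &tsc_max_warp);   /* measure any backwards skew */
    if ( tsc_max_warp && smp_processor_id() == 0 )
    {
        tsc_reliable = -1;                    /* note that verification failed */
        rendezvous_func = time_calibration_tsc_rendezvous;  /* Xen manages the TSC again */
    }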

The check_tsc_warp algorithm is CPU-intensive, so we run it on a
decaying schedule, at 1sec, 2sec, 4sec, 8sec, 16sec, 32sec, etc.
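
The log2 spacing comes from a power-of-two test on a running call counter.
A stand-alone sketch of the idea (the helper name here is made up for
illustration; the patch open-codes the same test in the rendezvous
function):

    /* Run the expensive check only when the call count is a power of two,
     * i.e. on calls 1, 2, 4, 8, 16, ... giving the decaying schedule above. */
    static unsigned long tsc_verify_decay;

    static int should_verify_now(void)    /* hypothetical name, for illustration */
    {
        tsc_verify_decay++;
        return !(tsc_verify_decay & (tsc_verify_decay - 1));
    }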

Also rename X86_FEATURE_NOSTOP_TSC to X86_FEATURE_NONSTOP_TSC to
match the Linux spelling.

Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>

diff -r 1e33261a814f xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/Makefile     Mon Sep 28 13:26:11 2009 -0600
@@ -45,6 +45,7 @@ obj-y += string.o
 obj-y += string.o
 obj-y += sysctl.o
 obj-y += time.o
+obj-y += tsc_sync.o
 obj-y += trace.o
 obj-y += traps.o
 obj-y += usercopy.o
diff -r 1e33261a814f xen/arch/x86/cpu/amd.c
--- a/xen/arch/x86/cpu/amd.c    Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/amd.c    Mon Sep 28 13:26:11 2009 -0600
@@ -463,7 +463,9 @@ static void __devinit init_amd(struct cp
                c->x86_power = cpuid_edx(0x80000007);
                if (c->x86_power & (1<<8)) {
                        set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-                       set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+                       set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+                        if ( c->x86 != 0x11 )
+                           set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
                }
        }
 
diff -r 1e33261a814f xen/arch/x86/cpu/intel.c
--- a/xen/arch/x86/cpu/intel.c  Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/intel.c  Mon Sep 28 13:26:11 2009 -0600
@@ -226,7 +226,8 @@ static void __devinit init_intel(struct 
                set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
        if (cpuid_edx(0x80000007) & (1u<<8)) {
                set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-               set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+               set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+               set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
        }
        if ((c->cpuid_level >= 0x00000006) &&
            (cpuid_eax(0x00000006) & (1u<<2)))
diff -r 1e33261a814f xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/time.c       Mon Sep 28 13:26:11 2009 -0600
@@ -698,7 +698,7 @@ void cstate_restore_tsc(void)
     s_time_t stime_delta;
     u64 new_tsc;
 
-    if ( boot_cpu_has(X86_FEATURE_NOSTOP_TSC) )
+    if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
         return;
 
     stime_delta = read_platform_stime() - t->stime_master_stamp;
@@ -1100,6 +1100,11 @@ struct calibration_rendezvous {
     u64 master_tsc_stamp;
 };
 
+static void (*rendezvous_func) (void *info);
+static int tsc_reliable = 0;
+static unsigned long tsc_max_warp = 0;
+static unsigned long tsc_verify_decay = 0;
+
 static void time_calibration_tsc_rendezvous(void *_r)
 {
     int i;
@@ -1180,6 +1185,50 @@ static void time_calibration_std_rendezv
     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
 }
 
+static void time_verify_tsc_calibration_rendezvous(void *_r)
+{
+    struct cpu_calibration *c = &this_cpu(cpu_calibration);
+    struct calibration_rendezvous *r = _r;
+    unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
+
+    /* check_tsc_warp is VERY expensive so test only on log2 intervals */
+    tsc_verify_decay++;
+    if ( !(tsc_verify_decay & (tsc_verify_decay-1)) )
+    {
+        if ( smp_processor_id() == 0 )
+        {
+            while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+        }
+        else
+        {
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) < total_cpus )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) > total_cpus )
+                mb();
+        }
+    }
+
+    if ( tsc_max_warp && smp_processor_id() == 0 )
+    {
+        printk("TSC warp detected (%lu cycles), disabling reliable TSC\n",
+                tsc_max_warp);
+        tsc_reliable = -1;
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    }
+
+    rdtscll(c->local_tsc_stamp);
+    c->stime_local_stamp = get_s_time();
+    c->stime_master_stamp = r->master_stime;
+
+    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
+}
+
 static void time_calibration(void *unused)
 {
     struct calibration_rendezvous r = {
@@ -1188,11 +1237,7 @@ static void time_calibration(void *unuse
     };
 
     /* @wait=1 because we must wait for all cpus before freeing @r. */
-    on_selected_cpus(&r.cpu_calibration_map,
-                     opt_consistent_tscs
-                     ? time_calibration_tsc_rendezvous
-                     : time_calibration_std_rendezvous,
-                     &r, 1);
+    on_selected_cpus(&r.cpu_calibration_map, rendezvous_func, &r, 1);
 }
 
 void init_percpu_time(void)
@@ -1219,16 +1264,19 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    if ( !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
-        opt_consistent_tscs = 0;
-
-    /* If we have constant TSCs then scale factor can be shared. */
-    if ( opt_consistent_tscs )
+    /* If we have reliable TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
     {
         int cpu;
         for_each_possible_cpu ( cpu )
             per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+        rendezvous_func = time_verify_tsc_calibration_rendezvous;
+        tsc_reliable = 1;
     }
+    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    else
+        rendezvous_func = time_calibration_std_rendezvous;
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
@@ -1463,6 +1511,13 @@ static void dump_softtsc(unsigned char k
     struct domain *d;
     int domcnt = 0;
 
+    if ( tsc_reliable > 0 )
+        printk("TSC is reliable\n");
+    else if ( tsc_reliable < 0 )
+        printk("Hardware determined TSC reliable, verification failed with "
+               "warp = %lu cycles\n", tsc_max_warp);
+    else
+        printk("TSC is not reliable\n");
     for_each_domain ( d )
     {
         if ( !d->arch.vtsc )
diff -r 1e33261a814f xen/arch/x86/tsc_sync.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/tsc_sync.c   Mon Sep 28 13:26:11 2009 -0600
@@ -0,0 +1,93 @@
+/*
+ * check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ * Modified for Xen by Dan Magenheimer, Oracle Corp.
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <xen/config.h>
+#include <xen/spinlock.h>
+#include <asm/processor.h>
+#include <asm/time.h>
+
+/* FIXME Are these OK for Xen? Xen has no _raw_spin_lock() */
+#define rdtsc_barrier  mb
+#define raw_spinlock_t spinlock_t
+#define __raw_spin_lock spin_lock
+#define __raw_spin_unlock spin_unlock
+#define __RAW_SPIN_LOCK_UNLOCKED  SPIN_LOCK_UNLOCKED
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
+static __cpuinitdata cycles_t last_tsc;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
+{
+       cycles_t start, now, prev, end;
+       int i;
+
+       rdtsc_barrier();
+       start = get_cycles();
+       rdtsc_barrier();
+       /*
+        * The measurement runs for 2 msecs:
+        */
+       end = start + tsc_khz * 2ULL;
+       now = start;
+
+       for (i = 0; ; i++) {
+               /*
+                * We take the global lock, measure TSC, save the
+                * previous TSC that was measured (possibly on
+                * another CPU) and update the previous TSC timestamp.
+                */
+               __raw_spin_lock(&sync_lock);
+               prev = last_tsc;
+               rdtsc_barrier();
+               now = get_cycles();
+               rdtsc_barrier();
+               last_tsc = now;
+               __raw_spin_unlock(&sync_lock);
+
+               /*
+                * Be nice every now and then (and also check whether
+                * measurement is done [we also insert a 10 million
+                * loops safety exit, so we dont lock up in case the
+                * TSC readout is totally broken]):
+                */
+               if (unlikely(!(i & 7))) {
+                       if (now > end || i > 10000000)
+                               break;
+                       cpu_relax();
+                       /*touch_nmi_watchdog();*/
+               }
+               /*
+                * Outside the critical section we can now see whether
+                * we saw a time-warp of the TSC going backwards:
+                */
+               if (unlikely(prev > now)) {
+                       __raw_spin_lock(&sync_lock);
+                       if ( *max_warp < prev - now )
+                               *max_warp = prev - now;
+                       __raw_spin_unlock(&sync_lock);
+               }
+       }
+}
diff -r 1e33261a814f xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h  Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/cpufeature.h  Mon Sep 28 13:26:11 2009 -0600
@@ -74,9 +74,10 @@
 #define X86_FEATURE_P3         (3*32+ 6) /* P3 */
 #define X86_FEATURE_P4         (3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_NOSTOP_TSC (3*32+ 9) /* TSC does not stop in C states */
+#define X86_FEATURE_NONSTOP_TSC        (3*32+ 9) /* TSC does not stop in C states */
 #define X86_FEATURE_ARAT       (3*32+ 10) /* Always running APIC timer */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_TSC_RELIABLE (3*32+12) /* TSC is known to be reliable */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
diff -r 1e33261a814f xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/time.h        Mon Sep 28 13:26:11 2009 -0600
@@ -43,4 +43,6 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp);
+
 #endif /* __X86_TIME_H__ */

Attachment: tsc-reliable.patch
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel