[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 2/2] x86: introduce MWAIT-based, ACPI-less CPU idle driver



This is a port of Linux'es intel-idle driver serving the same purpose.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -620,6 +620,14 @@ limit is ignored by Xen.
 
 Specify if the MMConfig space should be enabled.
 
+### mwait-idle
+> `= <boolean>`
+
+> Default: `true`
+
+Use the MWAIT idle driver (with model specific C-state knowledge) instead
+of the ACPI based one.
+
 ### nmi
 > `= ignore | dom0 | fatal`
 
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -39,7 +39,6 @@
 #include <xen/smp.h>
 #include <xen/guest_access.h>
 #include <xen/keyhandler.h>
-#include <xen/cpuidle.h>
 #include <xen/trace.h>
 #include <xen/sched-if.h>
 #include <xen/irq.h>
@@ -54,6 +53,8 @@
 #include <public/sysctl.h>
 #include <acpi/cpufreq/cpufreq.h>
 #include <asm/apic.h>
+#include <asm/cpuidle.h>
+#include <asm/mwait.h>
 #include <xen/notifier.h>
 #include <xen/cpu.h>
 
@@ -70,18 +71,18 @@
 #define GET_CC7_RES(val)  GET_HW_RES_IN_NS(0x3FE, val) /* SNB only */
 
 static void lapic_timer_nop(void) { }
-static void (*lapic_timer_off)(void);
-static void (*lapic_timer_on)(void);
+void (*__read_mostly lapic_timer_off)(void);
+void (*__read_mostly lapic_timer_on)(void);
 
 static uint64_t (*__read_mostly tick_to_ns)(uint64_t) = acpi_pm_tick_to_ns;
 
-static void (*pm_idle_save) (void) __read_mostly;
+void (*__read_mostly pm_idle_save)(void);
 unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
 integer_param("max_cstate", max_cstate);
 static bool_t __read_mostly local_apic_timer_c2_ok;
 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
 
-static struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
+struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
 
 struct hw_residencies
 {
@@ -236,12 +237,10 @@ static uint64_t acpi_pm_ticks_elapsed(ui
         return ((0xFFFFFFFF - t1) + t2 +1);
 }
 
-static uint64_t (*__read_mostly get_tick)(void) = get_acpi_pm_tick;
+uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
 static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
     = acpi_pm_ticks_elapsed;
 
-#define MWAIT_ECX_INTERRUPT_BREAK   (0x1)
-
 /*
  * The bit is set iff cpu use monitor/mwait to enter C state
  * with this flag set, CPU can be waken up from C state
@@ -263,7 +262,7 @@ void cpuidle_wakeup_mwait(cpumask_t *mas
     cpumask_andnot(mask, mask, &target);
 }
 
-static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 {
     unsigned int cpu = smp_processor_id();
     s_time_t expires = per_cpu(timer_deadline, cpu);
@@ -334,7 +333,7 @@ static struct {
     unsigned int count;
 } c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
 
-static inline void trace_exit_reason(u32 *irq_traced)
+void trace_exit_reason(u32 *irq_traced)
 {
     if ( unlikely(tb_init_done) )
     {
@@ -354,15 +353,6 @@ static inline void trace_exit_reason(u32
     }
 }
 
-/* vcpu is urgent if vcpu is polling event channel
- *
- * if urgent vcpu exists, CPU should not enter deep C state
- */
-static int sched_has_urgent_vcpu(void)
-{
-    return atomic_read(&this_cpu(schedule_data).urgent_count);
-}
-
 /*
  * "AAJ72. EOI Transaction May Not be Sent if Software Enters Core C6 During 
  * an Interrupt Service Routine"
@@ -388,10 +378,11 @@ bool_t errata_c6_eoi_workaround(void)
     return (fix_needed && cpu_has_pending_apic_eoi());
 }
 
-static inline void acpi_update_idle_stats(struct acpi_processor_power *power,
-                                          struct acpi_processor_cx *cx,
-                                          int64_t sleep_ticks)
+void update_idle_stats(struct acpi_processor_power *power,
+                       struct acpi_processor_cx *cx,
+                       uint64_t before, uint64_t after)
 {
+    int64_t sleep_ticks = ticks_elapsed(before, after);
     /* Interrupts are disabled */
 
     spin_lock(&power->stat_lock);
@@ -472,19 +463,19 @@ static void acpi_processor_idle(void)
         if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
         {
             /* Get start time (ticks) */
-            t1 = get_tick();
+            t1 = cpuidle_get_tick();
             /* Trace cpu idle entry */
             TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
             /* Invoke C2 */
             acpi_idle_do_entry(cx);
             /* Get end time (ticks) */
-            t2 = get_tick();
+            t2 = cpuidle_get_tick();
             trace_exit_reason(irq_traced);
             /* Trace cpu idle exit */
             TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
                      irq_traced[0], irq_traced[1], irq_traced[2], 
irq_traced[3]);
             /* Update statistics */
-            acpi_update_idle_stats(power, cx, ticks_elapsed(t1, t2));
+            update_idle_stats(power, cx, t1, t2);
             /* Re-enable interrupts */
             local_irq_enable();
             break;
@@ -500,7 +491,7 @@ static void acpi_processor_idle(void)
         lapic_timer_off();
 
         /* Get start time (ticks) */
-        t1 = get_tick();
+        t1 = cpuidle_get_tick();
         /* Trace cpu idle entry */
         TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
 
@@ -549,7 +540,7 @@ static void acpi_processor_idle(void)
         }
 
         /* Get end time (ticks) */
-        t2 = get_tick();
+        t2 = cpuidle_get_tick();
 
         /* recovering TSC */
         cstate_restore_tsc();
@@ -559,7 +550,7 @@ static void acpi_processor_idle(void)
                  irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
 
         /* Update statistics */
-        acpi_update_idle_stats(power, cx, ticks_elapsed(t1, t2));
+        update_idle_stats(power, cx, t1, t2);
         /* Re-enable interrupts */
         local_irq_enable();
         /* recovering APIC */
@@ -586,7 +577,7 @@ static void acpi_processor_idle(void)
         cpuidle_current_governor->reflect(power);
 }
 
-static void acpi_dead_idle(void)
+void acpi_dead_idle(void)
 {
     struct acpi_processor_power *power;
     struct acpi_processor_cx *cx;
@@ -649,7 +640,7 @@ default_halt:
         halt();
 }
 
-static int cpuidle_init_cpu(int cpu)
+int cpuidle_init_cpu(unsigned int cpu)
 {
     struct acpi_processor_power *acpi_power;
 
@@ -660,7 +651,7 @@ static int cpuidle_init_cpu(int cpu)
 
         if ( cpu == 0 && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
         {
-            get_tick = get_stime_tick;
+            cpuidle_get_tick = get_stime_tick;
             ticks_elapsed = stime_ticks_elapsed;
             tick_to_ns = stime_tick_to_ns;
         }
@@ -685,9 +676,6 @@ static int cpuidle_init_cpu(int cpu)
     return 0;
 }
 
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
 static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
 {
     struct cpuinfo_x86 *c = &current_cpu_data;
@@ -1026,6 +1014,9 @@ long set_cx_pminfo(uint32_t cpu, struct 
     if ( unlikely(!guest_handle_okay(power->states, power->count)) )
         return -EFAULT;
 
+    if ( pm_idle_save && pm_idle != acpi_processor_idle )
+        return 0;
+
     print_cx_pminfo(cpu, power);
 
     /* map from acpi_id to cpu_id */
@@ -1195,7 +1186,12 @@ static struct notifier_block cpu_nfb = {
 static int __init cpuidle_presmp_init(void)
 {
     void *cpu = (void *)(long)smp_processor_id();
-    cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+
+    if ( !xen_cpuidle )
+        return 0;
+
+    mwait_idle_init(&cpu_nfb);
+    cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
     register_cpu_notifier(&cpu_nfb);
     return 0;
 }
--- a/xen/arch/x86/cpu/Makefile
+++ b/xen/arch/x86/cpu/Makefile
@@ -5,6 +5,7 @@ obj-y += amd.o
 obj-y += common.o
 obj-y += intel.o
 obj-y += intel_cacheinfo.o
+obj-y += mwait-idle.o
 
 # Keeping around for VIA support (JBeulich)
 # obj-$(x86_32) += centaur.o
--- /dev/null
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -0,0 +1,513 @@
+/*
+ * mwait_idle.c - native hardware idle loop for modern processors
+ *
+ * Copyright (c) 2010, Intel Corporation.
+ * Len Brown <len.brown@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * mwait_idle is a cpuidle driver that loads on specific processors
+ * in lieu of the legacy ACPI processor_idle driver.  The intent is to
+ * make Linux more efficient on these processors, as mwait_idle knows
+ * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
+ */
+
+/*
+ * Design Assumptions
+ *
+ * All CPUs have same idle states as boot CPU
+ *
+ * Chipset BM_STS (bus master status) bit is a NOP
+ *     for preventing entry into deep C-states
+ */
+
+/*
+ * Known limitations
+ *
+ * The driver currently initializes for_each_online_cpu() upon load.
+ * It it unaware of subsequent processors hot-added to the system.
+ * This means that if you boot with maxcpus=n and later online
+ * processors above n, those processors will use C1 only.
+ *
+ * ACPI has a .suspend hack to turn off deep C-states during suspend
+ * to avoid complications with the lapic timer workaround.
+ * Have not seen issues with suspend, but may need same workaround here.
+ */
+
+/* un-comment DEBUG to enable pr_debug() statements */
+#define DEBUG
+
+#include <xen/lib.h>
+#include <xen/cpu.h>
+#include <xen/init.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <asm/cpuidle.h>
+#include <asm/mwait.h>
+#include <asm/msr.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define MWAIT_IDLE_VERSION "0.4"
+#undef PREFIX
+#define PREFIX "mwait-idle: "
+
+#ifdef DEBUG
+# define pr_debug(fmt...) printk(KERN_DEBUG fmt)
+#else
+# define pr_debug(fmt...)
+#endif
+
+static __initdata bool_t no_mwait_idle;
+invbool_param("mwait-idle", no_mwait_idle);
+
+static unsigned int mwait_substates;
+
+#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
+/* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
+static unsigned int lapic_timer_reliable_states = (1 << 1);
+
+struct idle_cpu {
+       const struct cpuidle_state *state_table;
+
+       /*
+        * Hardware C-state auto-demotion may not always be optimal.
+        * Indicate which enable bits to clear here.
+        */
+       unsigned long auto_demotion_disable_flags;
+};
+
+static const struct idle_cpu *icpu;
+
+static const struct cpuidle_state {
+       char            name[16];
+       unsigned int    flags;
+       unsigned int    exit_latency; /* in US */
+       int             power_usage; /* in mW */
+       unsigned int    target_residency; /* in US */
+} *cpuidle_state_table;
+
+/*
+ * Set this flag for states where the HW flushes the TLB for us
+ * and so we don't need cross-calls to keep it consistent.
+ * If this flag is set, SW flushes the TLB, so even if the
+ * HW doesn't do the flushing, this flag is safe to use.
+ */
+#define CPUIDLE_FLAG_TLB_FLUSHED       0x10000
+
+/*
+ * States are indexed by the cstate number,
+ * which is also the index into the MWAIT hint array.
+ * Thus C0 is a dummy.
+ */
+static const struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
+       { /* MWAIT C0 */ },
+       { /* MWAIT C1 */
+               .name = "C1-NHM",
+               .exit_latency = 3,
+               .target_residency = 6,
+       },
+       { /* MWAIT C2 */
+               .name = "C3-NHM",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 20,
+               .target_residency = 80,
+       },
+       { /* MWAIT C3 */
+               .name = "C6-NHM",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 200,
+               .target_residency = 800,
+       }
+};
+
+static const struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+       { /* MWAIT C0 */ },
+       { /* MWAIT C1 */
+               .name = "C1-SNB",
+               .exit_latency = 1,
+               .target_residency = 1,
+       },
+       { /* MWAIT C2 */
+               .name = "C3-SNB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 80,
+               .target_residency = 211,
+       },
+       { /* MWAIT C3 */
+               .name = "C6-SNB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 104,
+               .target_residency = 345,
+       },
+       { /* MWAIT C4 */
+               .name = "C7-SNB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 109,
+               .target_residency = 345,
+       }
+};
+
+static const struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+       { /* MWAIT C0 */ },
+       { /* MWAIT C1 */
+               .name = "C1-IVB",
+               .exit_latency = 1,
+               .target_residency = 1,
+       },
+       { /* MWAIT C2 */
+               .name = "C3-IVB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 59,
+               .target_residency = 156,
+       },
+       { /* MWAIT C3 */
+               .name = "C6-IVB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 80,
+               .target_residency = 300,
+       },
+       { /* MWAIT C4 */
+               .name = "C7-IVB",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 87,
+               .target_residency = 300,
+       }
+};
+
+static const struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
+       { /* MWAIT C0 */ },
+       { /* MWAIT C1 */
+               .name = "C1-ATM",
+               .exit_latency = 1,
+               .target_residency = 4,
+       },
+       { /* MWAIT C2 */
+               .name = "C2-ATM",
+               .exit_latency = 20,
+               .target_residency = 80,
+       },
+       { /* MWAIT C3 */ },
+       { /* MWAIT C4 */
+               .name = "C4-ATM",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 100,
+               .target_residency = 400,
+       },
+       { /* MWAIT C5 */ },
+       { /* MWAIT C6 */
+               .name = "C6-ATM",
+               .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 140,
+               .target_residency = 560,
+       }
+};
+
+static u32 get_driver_data(unsigned int cstate)
+{
+       static const u32 driver_data[] = {
+               [1] /* MWAIT C1 */ = 0x00,
+               [2] /* MWAIT C2 */ = 0x10,
+               [3] /* MWAIT C3 */ = 0x20,
+               [4] /* MWAIT C4 */ = 0x30,
+               [5] /* MWAIT C5 */ = 0x40,
+               [6] /* MWAIT C6 */ = 0x52,
+       };
+
+       return driver_data[cstate < ARRAY_SIZE(driver_data) ? cstate : 0];
+}
+
+static void mwait_idle(void)
+{
+       unsigned int cpu = smp_processor_id();
+       struct acpi_processor_power *power = processor_powers[cpu];
+       struct acpi_processor_cx *cx = NULL;
+       unsigned int eax, next_state, cstate;
+       u64 before, after;
+       u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
+
+       if (max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
+           (next_state = cpuidle_current_governor->select(power)) > 0) {
+               do {
+                       cx = &power->states[next_state];
+               } while (cx->type > max_cstate && --next_state);
+               if (!next_state)
+                       cx = NULL;
+               menu_get_trace_data(&exp, &pred);
+       }
+       if (!cx) {
+               if (pm_idle_save)
+                       pm_idle_save();
+               else
+                       safe_halt();
+               return;
+       }
+
+       cpufreq_dbs_timer_suspend();
+
+       sched_tick_suspend();
+       /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+       process_pending_softirqs();
+
+       /* Interrupts must be disabled for C2 and higher transitions. */
+       local_irq_disable();
+
+       if (!cpu_is_haltable(cpu)) {
+               local_irq_enable();
+               sched_tick_resume();
+               cpufreq_dbs_timer_resume();
+               return;
+       }
+
+       power->last_state = cx;
+       eax = cx->address;
+       cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
+
+#if 0 /* XXX Can we/do we need to do something similar on Xen? */
+       /*
+        * leave_mm() to avoid costly and often unnecessary wakeups
+        * for flushing the user TLB's associated with the active mm.
+        */
+       if (cpuidle_state_table[].flags & CPUIDLE_FLAG_TLB_FLUSHED)
+               leave_mm(cpu);
+#endif
+
+       if (!(lapic_timer_reliable_states & (1 << cstate)))
+               lapic_timer_off();
+
+       before = cpuidle_get_tick();
+       TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, before, exp, pred);
+
+       if (cpu_is_haltable(cpu))
+               mwait_idle_with_hints(eax, MWAIT_ECX_INTERRUPT_BREAK);
+
+       after = cpuidle_get_tick();
+
+       cstate_restore_tsc();
+       trace_exit_reason(irq_traced);
+       TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, after,
+               irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
+
+       update_idle_stats(power, cx, before, after);
+       local_irq_enable();
+
+       if (!(lapic_timer_reliable_states & (1 << cstate)))
+               lapic_timer_on();
+
+       /* Now back in C0. */
+       power->last_state = &power->states[0];
+
+       sched_tick_resume();
+       cpufreq_dbs_timer_resume();
+
+       if ( cpuidle_current_governor->reflect )
+               cpuidle_current_governor->reflect(power);
+}
+
+static void auto_demotion_disable(void *dummy)
+{
+       u64 msr_bits;
+
+       rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+       msr_bits &= ~(icpu->auto_demotion_disable_flags);
+       wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+}
+
+static const struct idle_cpu idle_cpu_nehalem = {
+       .state_table = nehalem_cstates,
+       .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
+};
+
+static const struct idle_cpu idle_cpu_atom = {
+       .state_table = atom_cstates,
+};
+
+static const struct idle_cpu idle_cpu_lincroft = {
+       .state_table = atom_cstates,
+       .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
+};
+
+static const struct idle_cpu idle_cpu_snb = {
+       .state_table = snb_cstates,
+};
+
+static const struct idle_cpu idle_cpu_ivb = {
+       .state_table = ivb_cstates,
+};
+
+#define ICPU(model, cpu) { 6, model, &idle_cpu_##cpu }
+
+static struct intel_idle_id {
+       unsigned int family, model;
+       const struct idle_cpu *data;
+} intel_idle_ids[] __initdata = {
+       ICPU(0x1a, nehalem),
+       ICPU(0x1e, nehalem),
+       ICPU(0x1f, nehalem),
+       ICPU(0x25, nehalem),
+       ICPU(0x2c, nehalem),
+       ICPU(0x2e, nehalem),
+       ICPU(0x2f, nehalem),
+       ICPU(0x1c, atom),
+       ICPU(0x26, lincroft),
+       ICPU(0x2a, snb),
+       ICPU(0x2d, snb),
+       ICPU(0x3a, ivb),
+       {}
+};
+
+static int __init mwait_idle_probe(void)
+{
+       unsigned int eax, ebx, ecx;
+       const struct intel_idle_id *id;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           !boot_cpu_has(X86_FEATURE_MWAIT) ||
+           boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+               return -ENODEV;
+
+       for (id = intel_idle_ids; id->family; ++id)
+               if (id->family == boot_cpu_data.x86 &&
+                   id->model == boot_cpu_data.x86_model)
+                       break;
+       if (!id->family) {
+               pr_debug(PREFIX "does not run on family %d model %d\n",
+                        boot_cpu_data.x86, boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+       if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+           !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
+           !mwait_substates)
+               return -ENODEV;
+
+       if (!max_cstate || no_mwait_idle) {
+               pr_debug(PREFIX "disabled\n");
+               return -EPERM;
+       }
+
+       pr_debug(PREFIX "MWAIT substates: %#x\n", mwait_substates);
+
+       icpu = id->data;
+       cpuidle_state_table = icpu->state_table;
+
+       if (boot_cpu_has(X86_FEATURE_ARAT))
+               lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
+
+       pr_debug(PREFIX "v" MWAIT_IDLE_VERSION " model %#x\n",
+                boot_cpu_data.x86_model);
+
+       pr_debug(PREFIX "lapic_timer_reliable_states %#x\n",
+                lapic_timer_reliable_states);
+       return 0;
+}
+
+static int mwait_idle_cpu_init(struct notifier_block *nfb,
+                              unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu, cstate;
+       struct acpi_processor_power *dev = processor_powers[cpu];
+
+       switch (action) {
+       default:
+               return NOTIFY_DONE;
+
+       case CPU_UP_PREPARE:
+               cpuidle_init_cpu(cpu);
+               return NOTIFY_DONE;
+
+       case CPU_ONLINE:
+               if (!dev)
+                       return NOTIFY_DONE;
+               break;
+       }
+
+       dev->count = 1;
+
+       for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
+               unsigned int num_substates;
+               struct acpi_processor_cx *cx;
+
+               if (cstate > max_cstate) {
+                       printk(PREFIX "max C-state %u reached\n", max_cstate);
+                       break;
+               }
+
+               /* Does the state exist in CPUID.MWAIT? */
+               num_substates = (mwait_substates >> (cstate * 4))
+                       & MWAIT_SUBSTATE_MASK;
+               if (!num_substates)
+                       continue;
+               /* Is the state not enabled? */
+               if (!cpuidle_state_table[cstate].target_residency) {
+                       /* does the driver not know about the state? */
+                       if (!pm_idle_save && !*cpuidle_state_table[cstate].name)
+                               pr_debug(PREFIX "unaware of family %#x model 
%#x MWAIT %u\n",
+                                        boot_cpu_data.x86,
+                                        boot_cpu_data.x86_model, cstate);
+                       continue;
+               }
+
+               if (dev->count >= ACPI_PROCESSOR_MAX_POWER) {
+                       printk(PREFIX "max C-state count of %u reached\n",
+                              ACPI_PROCESSOR_MAX_POWER);
+                       break;
+               }
+
+               if (cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) {
+                       if (pm_idle_save)
+                               continue;
+                       setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+               }
+
+               cx = dev->states + dev->count;
+               cx->type = cstate;
+               cx->address = get_driver_data(cstate);
+               cx->entry_method = ACPI_CSTATE_EM_FFH;
+               cx->power = cpuidle_state_table[cstate].power_usage;
+               cx->latency = cpuidle_state_table[cstate].exit_latency;
+               cx->target_residency =
+                       cpuidle_state_table[cstate].target_residency;
+
+               dev->count++;
+       }
+
+       if (icpu->auto_demotion_disable_flags)
+               on_selected_cpus(cpumask_of(cpu), auto_demotion_disable, NULL, 
1);
+
+       return NOTIFY_DONE;
+}
+
+int __init mwait_idle_init(struct notifier_block *nfb)
+{
+       int err;
+
+       if (pm_idle_save)
+               return -ENODEV;
+
+       err = mwait_idle_probe();
+       if (!err) {
+               nfb->notifier_call = mwait_idle_cpu_init;
+               mwait_idle_cpu_init(nfb, CPU_UP_PREPARE, NULL);
+
+               pm_idle_save = pm_idle;
+               pm_idle = mwait_idle;
+               dead_idle = acpi_dead_idle;
+       }
+
+       return err;
+}
--- /dev/null
+++ b/xen/include/asm-x86/cpuidle.h
@@ -0,0 +1,35 @@
+#ifndef __ASM_X86_CPUIDLE_H__
+#define __ASM_X86_CPUIDLE_H__
+
+#include <xen/cpuidle.h>
+#include <xen/notifier.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+extern struct acpi_processor_power *processor_powers[];
+
+extern void (*pm_idle_save)(void);
+
+extern void (*lapic_timer_off)(void);
+extern void (*lapic_timer_on)(void);
+
+extern uint64_t (*cpuidle_get_tick)(void);
+
+int mwait_idle_init(struct notifier_block *);
+int cpuidle_init_cpu(unsigned int cpu);
+void acpi_dead_idle(void);
+void trace_exit_reason(u32 *irq_traced);
+void update_idle_stats(struct acpi_processor_power *,
+                       struct acpi_processor_cx *, uint64_t, uint64_t);
+
+/*
+ * vcpu is urgent if vcpu is polling event channel
+ *
+ * if urgent vcpu exists, CPU should not enter deep C state
+ */
+static inline int sched_has_urgent_vcpu(void)
+{
+    return atomic_read(&this_cpu(schedule_data).urgent_count);
+}
+
+#endif /* __X86_ASM_CPUIDLE_H__ */
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -36,6 +36,11 @@
 #define MSR_IA32_PERFCTR1              0x000000c2
 #define MSR_FSB_FREQ                   0x000000cd
 
+#define MSR_NHM_SNB_PKG_CST_CFG_CTL    0x000000e2
+#define NHM_C3_AUTO_DEMOTE             (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE             (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE         (1UL << 25)
+
 #define MSR_MTRRcap                    0x000000fe
 #define MSR_IA32_BBL_CR_CTL            0x00000119
 
--- /dev/null
+++ b/xen/include/asm-x86/mwait.h
@@ -0,0 +1,17 @@
+#ifndef __ASM_X86_MWAIT_H__
+#define __ASM_X86_MWAIT_H__
+
+#define MWAIT_SUBSTATE_MASK            0xf
+#define MWAIT_CSTATE_MASK              0xf
+#define MWAIT_SUBSTATE_SIZE            4
+#define MWAIT_MAX_NUM_CSTATES          8
+
+#define CPUID_MWAIT_LEAF               5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK     0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK      0x1
+
+void mwait_idle_with_hints(unsigned int eax, unsigned int ecx);
+
+#endif /* __ASM_X86_MWAIT_H__ */


Attachment: x86-mwait-idle.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.