# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1184172490 -3600
# Node ID c3929e540632ec30c19d85a2884f1e49081b8410
# Parent e00547dcda097c10e4c1390f0e2873deee741c0c
Provide cpu hotplug support to Xen. Note that this hotplug
support is specific to PM; it is not a general run-time
single-CPU hotplug, which can be a separate task. See the
embedded comment:
/*
* XXX: One important thing missed here is to migrate vcpus
* from dead cpu to other online ones and then put whole
* system into a stop state. It assures a safe environment
* for a cpu hotplug/remove at normal running state.
*
* However for xen PM case, at this point:
* -> All other domains should be notified with PM event,
* and then in following states:
* * Suspend state, or
* * Paused state, which is a force step to all
* domains if they do nothing to suspend
* -> All vcpus of dom0 (except vcpu0) have already been
* hot removed
* with the net effect that all other cpus only have idle vcpu
* running. In this special case, we can avoid vcpu migration
* then and system can be considered in a stop state.
*
* So current cpu hotplug is a special version for PM specific
* usage, and needs more effort later for full cpu hotplug.
* (ktian1)
*/
Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
xen/arch/x86/cpu/common.c | 8 -
xen/arch/x86/domain.c | 29 ++---
xen/arch/x86/i8259.c | 1
xen/arch/x86/io_apic.c | 3
xen/arch/x86/irq.c | 62 +++++-------
xen/arch/x86/smp.c | 10 -
xen/arch/x86/smpboot.c | 218 +++++++++++++++++++++++++++++--------------
xen/include/asm-x86/config.h | 3
xen/include/asm-x86/smp.h | 13 ++
xen/include/asm-x86/system.h | 2
10 files changed, 213 insertions(+), 136 deletions(-)
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/cpu/common.c Wed Jul 11 17:48:10 2007 +0100
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
{
int cpu = raw_smp_processor_id();
cpu_clear(cpu, cpu_initialized);
-
- /* lazy TLB state */
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/domain.c Wed Jul 11 17:48:10 2007 +0100
@@ -81,24 +81,23 @@ static void default_idle(void)
/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
- /* This must be done before dead CPU ack */
- cpu_exit_clear();
- wbinvd();
- mb();
- /* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
-
- /*
- * With physical CPU hotplug, we should halt the cpu
- */
- local_irq_disable();
- while (1)
- halt();
+ __cpu_disable();
+ /* This must be done before dead CPU ack */
+ cpu_exit_clear();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /* With physical CPU hotplug, we should halt the cpu. */
+ local_irq_disable();
+ for ( ; ; )
+ halt();
}
#else
static inline void play_dead(void)
{
- BUG();
+ BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -106,6 +105,8 @@ void idle_loop(void)
{
for ( ; ; )
{
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
page_scrub_schedule_work();
default_idle();
do_softirq();
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/i8259.c Wed Jul 11 17:48:10 2007 +0100
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
irq_desc[i].action = NULL;
irq_desc[i].depth = 1;
spin_lock_init(&irq_desc[i].lock);
+ cpus_setall(irq_desc[i].affinity);
set_intr_gate(i, interrupt[i]);
}
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/io_apic.c Wed Jul 11 17:48:10 2007 +0100
@@ -34,9 +34,6 @@
#include <asm/desc.h>
#include <mach_apic.h>
#include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
/* Different to Linux: our implementation can be simpler. */
#define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/irq.c Wed Jul 11 17:48:10 2007 +0100
@@ -656,42 +656,34 @@ __initcall(setup_dump_irqs);
__initcall(setup_dump_irqs);
#ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
void fixup_irqs(cpumask_t map)
{
- unsigned int irq;
- static int warned;
-
- for (irq = 0; irq < NR_IRQS; irq++) {
- cpumask_t mask;
- if (irq == 2)
- continue;
-
- cpus_and(mask, irq_desc[irq].affinity, map);
- if (any_online_cpu(mask) == NR_CPUS) {
- printk("Breaking affinity for irq %i\n", irq);
- mask = map;
- }
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
- else if (irq_desc[irq].action && !(warned++))
- printk("Cannot set affinity for irq %i\n", irq);
- }
-
-#if 0
- barrier();
- /* Ingo Molnar says: "after the IO-APIC masks have been redirected
- [note the nop - the interrupt-enable boundary on x86 is two
- instructions from sti] - to flush out pending hardirqs and
- IPIs. After this point nothing is supposed to reach this CPU." */
- __asm__ __volatile__("sti; nop; cli");
- barrier();
-#else
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
+ unsigned int irq;
+ static int warned;
+
+ for ( irq = 0; irq < NR_IRQS; irq++ )
+ {
+ cpumask_t mask;
+ if ( irq == 2 )
+ continue;
+
+ cpus_and(mask, irq_desc[irq].affinity, map);
+ if ( any_online_cpu(mask) == NR_CPUS )
+ {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if ( irq_desc[irq].handler->set_affinity )
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if ( irq_desc[irq].action && !(warned++) )
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
#endif
-}
-#endif
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/smp.c
--- a/xen/arch/x86/smp.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/smp.c Wed Jul 11 17:48:10 2007 +0100
@@ -256,16 +256,6 @@ static DEFINE_SPINLOCK(call_lock);
static DEFINE_SPINLOCK(call_lock);
static struct call_data_struct *call_data;
-void lock_ipi_call_lock(void)
-{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
int smp_call_function(
void (*func) (void *info),
void *info,
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/smpboot.c Wed Jul 11 17:48:10 2007 +0100
@@ -110,6 +110,11 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
EXPORT_SYMBOL(x86_cpu_to_apicid);
static void map_cpu_to_logical_apicid(void);
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
/*
* The bootstrap kernel entry code has set these up. Save them for
@@ -396,9 +401,11 @@ void __devinit smp_callin(void)
/*
* Synchronize the TSC with the BP
*/
- if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+ if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
synchronize_tsc_ap();
- calibrate_tsc_ap();
+ /* No sync for same reason as above */
+ calibrate_tsc_ap();
+ }
}
static int cpucount, booting_cpu;
@@ -464,8 +471,12 @@ static void construct_percpu_idt(unsigne
{
unsigned char idt_load[10];
- idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
- memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
+ /* If IDT table exists since last hotplug, reuse it */
+ if (!idt_tables[cpu]) {
+ idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+ memcpy(idt_tables[cpu], idt_table,
+ IDT_ENTRIES*sizeof(idt_entry_t));
+ }
*(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
*(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
@@ -488,7 +499,7 @@ void __devinit start_secondary(void *unu
set_processor_id(cpu);
set_current(idle_vcpu[cpu]);
- this_cpu(curr_vcpu) = idle_vcpu[cpu];
+ this_cpu(curr_vcpu) = idle_vcpu[cpu];
percpu_traps_init();
@@ -516,23 +527,13 @@ void __devinit start_secondary(void *unu
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
- /*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI receipients, and the time when the determination is made
- * for which cpus receive the IPI. Holding this
- * lock helps us to not include this cpu in a currently in progress
- * smp_call_function().
- */
- /*lock_ipi_call_lock();*/
cpu_set(smp_processor_id(), cpu_online_map);
- /*unlock_ipi_call_lock();*/
- /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+ init_percpu_time();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
-
- init_percpu_time();
wmb();
startup_cpu_idle_loop();
@@ -794,6 +795,22 @@ static inline int alloc_cpu_id(void)
return cpu;
}
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+ if (idle_vcpu[cpu])
+ return idle_vcpu[cpu];
+
+ return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+ if (!stack_base[cpu])
+ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+ return stack_base[cpu];
+}
+
static int __devinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -811,7 +828,7 @@ static int __devinit do_boot_cpu(int api
booting_cpu = cpu;
- v = alloc_idle_vcpu(cpu);
+ v = prepare_idle_vcpu(cpu);
BUG_ON(v == NULL);
/* start_eip had better be page-aligned! */
@@ -820,7 +837,7 @@ static int __devinit do_boot_cpu(int api
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+ stack_start.esp = prepare_idle_stack(cpu);
/* Debug build: detect stack overflow by setting up a guard page. */
memguard_guard_stack(stack_start.esp);
@@ -898,6 +915,12 @@ static int __devinit do_boot_cpu(int api
}
#ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+ /* Give up lazy state borrowed by this idle vcpu */
+ __sync_lazy_execstate();
+}
+
void cpu_exit_clear(void)
{
int cpu = raw_smp_processor_id();
@@ -906,7 +929,6 @@ void cpu_exit_clear(void)
cpucount --;
cpu_uninit();
- irq_ctx_exit(cpu);
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
@@ -915,26 +937,9 @@ void cpu_exit_clear(void)
unmap_cpu_to_logical_apicid(cpu);
}
-struct warm_boot_cpu_info {
- struct completion *complete;
- int apicid;
- int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
- struct warm_boot_cpu_info *info = p;
- do_boot_cpu(info->apicid, info->cpu);
- complete(info->complete);
-}
-
static int __cpuinit __smp_prepare_cpu(int cpu)
{
- DECLARE_COMPLETION(done);
- struct warm_boot_cpu_info info;
- struct work_struct task;
int apicid, ret;
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
@@ -942,34 +947,12 @@ static int __cpuinit __smp_prepare_cpu(i
goto exit;
}
- /*
- * the CPU isn't initialized at boot time, allocate gdt table here.
- * cpu_init will initialize it
- */
- if (!cpu_gdt_descr->address) {
- cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
- if (!cpu_gdt_descr->address)
- printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
- ret = -ENOMEM;
- goto exit;
- }
-
- info.complete = &done;
- info.apicid = apicid;
- info.cpu = cpu;
- INIT_WORK(&task, do_warm_boot_cpu, &info);
-
tsc_sync_disabled = 1;
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- KERNEL_PGD_PTRS);
- flush_tlb_all();
- schedule_work(&task);
- wait_for_completion(&done);
+ do_boot_cpu(apicid, cpu);
tsc_sync_disabled = 0;
- zap_low_mappings();
+
ret = 0;
exit:
return ret;
@@ -1002,6 +985,8 @@ static void __init smp_boot_cpus(unsigne
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+ stack_base[0] = stack_start.esp;
/*current_thread_info()->cpu = 0;*/
/*smp_tune_scheduling();*/
@@ -1173,7 +1158,8 @@ void __devinit smp_prepare_boot_cpu(void
cpu_set(smp_processor_id(), cpu_callout_map);
cpu_set(smp_processor_id(), cpu_present_map);
cpu_set(smp_processor_id(), cpu_possible_map);
- /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ spin_lock_init(&cpu_add_remove_lock);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1196,11 +1182,12 @@ remove_siblinginfo(int cpu)
cpu_clear(cpu, cpu_sibling_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
- c[cpu].phys_proc_id = 0;
- c[cpu].cpu_core_id = 0;
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
cpu_clear(cpu, cpu_sibling_setup_map);
}
+extern void fixup_irqs(cpumask_t map);
int __cpu_disable(void)
{
cpumask_t map = cpu_online_map;
@@ -1217,12 +1204,15 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;
+ local_irq_disable();
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
mdelay(1);
local_irq_disable();
+ time_suspend();
+
remove_siblinginfo(cpu);
cpu_clear(cpu, map);
@@ -1241,13 +1231,89 @@ void __cpu_die(unsigned int cpu)
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
return;
}
- msleep(100);
+ mdelay(100);
+ mb();
+ process_pending_timers();
}
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/*
+ * XXX: One important thing missed here is to migrate vcpus
+ * from dead cpu to other online ones and then put whole
+ * system into a stop state. It assures a safe environment
+ * for a cpu hotplug/remove at normal running state.
+ *
+ * However for xen PM case, at this point:
+ * -> All other domains should be notified with PM event,
+ * and then in following states:
+ * * Suspend state, or
+ * * Paused state, which is a force step to all
+ * domains if they do nothing to suspend
+ * -> All vcpus of dom0 (except vcpu0) have already been
+ * hot removed
+ * with the net effect that all other cpus only have idle vcpu
+ * running. In this special case, we can avoid vcpu migration
+ * then and system can be considered in a stop state.
+ *
+ * So current cpu hotplug is a special version for PM specific
+ * usage, and needs more effort later for full cpu hotplug.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+ int err = 0;
+ cpumask_t mask;
+
+ spin_lock(&cpu_add_remove_lock);
+ if (num_online_cpus() == 1) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!cpu_online(cpu)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ printk("Prepare to bring CPU%d down...\n", cpu);
+ /* Send notification to remote idle vcpu */
+ cpus_clear(mask);
+ cpu_set(cpu, mask);
+ per_cpu(cpu_state, cpu) = CPU_DYING;
+ smp_send_event_check_mask(mask);
+
+ __cpu_die(cpu);
+
+ if (cpu_online(cpu)) {
+ printk("Bad state (DEAD, but in online map) on CPU%d\n", cpu);
+ err = -EBUSY;
+ }
+out:
+ spin_unlock(&cpu_add_remove_lock);
+ return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+ int err = 0;
+
+ spin_lock(&cpu_add_remove_lock);
+ if (cpu_online(cpu)) {
+ printk("Bring up a online cpu. Bogus!\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = __cpu_up(cpu);
+ if (err < 0)
+ goto out;
+
+out:
+ spin_unlock(&cpu_add_remove_lock);
+ return err;
}
/* From kernel/power/main.c */
@@ -1308,6 +1374,22 @@ void __cpu_die(unsigned int cpu)
int __devinit __cpu_up(unsigned int cpu)
{
+#ifdef CONFIG_HOTPLUG_CPU
+ int ret=0;
+
+ /*
+ * We do warm boot only on cpus that had booted earlier
+ * Otherwise cold boot is all handled from smp_boot_cpus().
+ * cpu_callin_map is set during AP kickstart process. It's reset
+ * when a cpu is taken offline from cpu_exit_clear().
+ */
+ if (!cpu_isset(cpu, cpu_callin_map))
+ ret = __smp_prepare_cpu(cpu);
+
+ if (ret)
+ return -EIO;
+#endif
+
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/config.h Wed Jul 11 17:48:10 2007 +0100
@@ -39,6 +39,9 @@
#define CONFIG_ACPI_SRAT 1
#define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG 1
+#define CONFIG_HOTPLUG_CPU 1
#define HZ 100
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/smp.h Wed Jul 11 17:48:10 2007 +0100
@@ -50,9 +50,22 @@ extern u8 x86_cpu_to_apicid[];
#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
+/* State of each CPU. */
+#define CPU_ONLINE 0x0002 /* CPU is up */
+#define CPU_DYING 0x0003 /* CPU is requested to die */
+#define CPU_DEAD 0x0004 /* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
#ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) == CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
#endif
/*
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/system.h Wed Jul 11 17:48:10 2007 +0100
@@ -313,6 +313,8 @@ static always_inline unsigned long long
#define __sti() __asm__ __volatile__("sti": : :"memory")
/* used in the idle loop; sti takes one instruction cycle to complete */
#define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt() __asm__ __volatile__("hlt": : :"memory")
/* For spinlocks etc */
#if defined(__i386__)
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|