[Xen-devel] [PATCH 5/10] Add Xen CPU hotplug support

To: "Keir Fraser" <keir@xxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 5/10] Add Xen CPU hotplug support
From: "Tian, Kevin" <kevin.tian@xxxxxxxxx>
Date: Wed, 27 Jun 2007 21:35:58 +0800
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx
Delivery-date: Wed, 27 Jun 2007 06:34:16 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: Ace4wBbjFXd9s277S1O9kS3pxpp18w==
Thread-topic: [PATCH 5/10] Add Xen CPU hotplug support
Provide CPU hotplug support to Xen. Note that this hotplug
support is specific to power management (PM), rather than
run-time single-CPU hotplug, which can be a separate task.
See the embedded comment:

/*
 * XXX: One important thing missing here is migrating vcpus
 * from the dead cpu to other online ones and then putting the
 * whole system into a stop state. That ensures a safe
 * environment for cpu hotplug/removal in the normal running state.
 *
 * However, for the Xen PM case, at this point:
 *     -> All other domains should have been notified with a PM
 *        event, and should then be in one of these states:
 *             * Suspend state, or
 *             * Paused state, which is forced on any domain
 *               that does nothing to suspend itself
 *     -> All vcpus of dom0 (except vcpu0) have already been
 *        hot removed
 * with the net effect that all other cpus run only the idle
 * vcpu. In this special case, we can skip vcpu migration and
 * the system can be considered to be in a stop state.
 *
 * So the current cpu hotplug code is a special version for
 * PM-specific usage; full cpu hotplug needs more effort later.
 * (ktian1)
 */

Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>
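
For context, here is a minimal sketch (not part of this patch) of how
the disable_nonboot_cpus()/enable_nonboot_cpus() pair declared in the
smp.h hunk below could be layered on top of cpu_down()/cpu_up(). It is
modeled on Linux's kernel/power/main.c, which the patch itself cites;
the frozen_cpus mask and the error handling are assumptions:

/* Sketch only: frozen_cpus and the error paths are assumptions,
 * not taken from this patch. */
static cpumask_t frozen_cpus;

void disable_nonboot_cpus(void)
{
    int cpu, error;

    cpus_clear(frozen_cpus);
    for_each_online_cpu ( cpu )
    {
        if ( cpu == 0 )
            continue;          /* never take the boot CPU down */
        error = cpu_down(cpu);
        if ( !error )
            cpu_set(cpu, frozen_cpus);
        else
            printk("Error taking cpu %d down: %d\n", cpu, error);
    }
}

void enable_nonboot_cpus(void)
{
    int cpu, error;

    for_each_cpu_mask ( cpu, frozen_cpus )
    {
        error = cpu_up(cpu);
        if ( error )
            printk("Error taking cpu %d up: %d\n", cpu, error);
    }
    cpus_clear(frozen_cpus);
}

A PM entry path would call disable_nonboot_cpus() once all domains are
suspended or paused and dom0's non-boot vcpus are removed, and
enable_nonboot_cpus() on the resume path.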

diff -r f15338af3e2a xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/cpu/common.c Mon Jun 18 03:27:04 2007 -0400
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
 {
        int cpu = raw_smp_processor_id();
        cpu_clear(cpu, cpu_initialized);
-
-       /* lazy TLB state */
-       per_cpu(cpu_tlbstate, cpu).state = 0;
-       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r f15338af3e2a xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/domain.c     Mon Jun 18 03:27:04 2007 -0400
@@ -81,6 +81,7 @@ static void default_idle(void)
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
+       __cpu_disable();
        /* This must be done before dead CPU ack */
        cpu_exit_clear();
        wbinvd();
@@ -106,6 +107,8 @@ void idle_loop(void)
 {
     for ( ; ; )
     {
+        if (cpu_is_offline(smp_processor_id()))
+            play_dead();
         page_scrub_schedule_work();
         default_idle();
         do_softirq();
diff -r f15338af3e2a xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c      Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/i8259.c      Mon Jun 18 03:27:04 2007 -0400
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
         irq_desc[i].action  = NULL;
         irq_desc[i].depth   = 1;
         spin_lock_init(&irq_desc[i].lock);
+        cpus_setall(irq_desc[i].affinity);
         set_intr_gate(i, interrupt[i]);
     }
 
diff -r f15338af3e2a xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c    Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/io_apic.c    Mon Jun 18 03:27:04 2007 -0400
@@ -34,9 +34,6 @@
 #include <asm/desc.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r f15338af3e2a xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/irq.c        Mon Jun 18 03:27:04 2007 -0400
@@ -656,7 +656,8 @@ __initcall(setup_dump_irqs);
 __initcall(setup_dump_irqs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
 
 void fixup_irqs(cpumask_t map)
 {
@@ -673,8 +674,8 @@ void fixup_irqs(cpumask_t map)
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
                }
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
+               if (irq_desc[irq].handler->set_affinity)
+                       irq_desc[irq].handler->set_affinity(irq, mask);
                else if (irq_desc[irq].action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff -r f15338af3e2a xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/smpboot.c    Mon Jun 18 03:32:00 2007 -0400
@@ -110,6 +110,11 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 
 static void map_cpu_to_logical_apicid(void);
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
 
 /*
  * The bootstrap kernel entry code has set these up. Save them for
@@ -396,9 +401,11 @@ void __devinit smp_callin(void)
        /*
         *      Synchronize the TSC with the BP
         */
-       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
                synchronize_tsc_ap();
-       calibrate_tsc_ap();
+               /* No sync for same reason as above */
+               calibrate_tsc_ap();
+       }
 }
 
 static int cpucount, booting_cpu;
@@ -464,8 +471,12 @@ static void construct_percpu_idt(unsigne
 {
        unsigned char idt_load[10];
 
-       idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-       memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
+       /* If IDT table exists since last hotplug, reuse it */
+       if (!idt_tables[cpu]) {
+               idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+               memcpy(idt_tables[cpu], idt_table,
+                               IDT_ENTRIES*sizeof(idt_entry_t));
+       }
 
        *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
        *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
@@ -524,15 +535,15 @@ void __devinit start_secondary(void *unu
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       /*lock_ipi_call_lock();*/
+       lock_ipi_call_lock();
        cpu_set(smp_processor_id(), cpu_online_map);
-       /*unlock_ipi_call_lock();*/
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+        init_percpu_time();
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
-
-        init_percpu_time();
 
        wmb();
        startup_cpu_idle_loop();
@@ -794,6 +805,22 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+       if (idle_vcpu[cpu])
+               return idle_vcpu[cpu];
+
+       return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+       if (!stack_base[cpu])
+               stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+       return stack_base[cpu];
+}
+
 static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -811,7 +838,7 @@ static int __devinit do_boot_cpu(int api
 
        booting_cpu = cpu;
 
-       v = alloc_idle_vcpu(cpu);
+       v = prepare_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
@@ -820,7 +847,7 @@ static int __devinit do_boot_cpu(int api
        /* So we see what's up   */
        printk("Booting processor %d/%d eip %lx\n", cpu, apicid,
start_eip);
 
-       stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+       stack_start.esp = prepare_idle_stack(cpu);
 
        /* Debug build: detect stack overflow by setting up a guard page. */
        memguard_guard_stack(stack_start.esp);
@@ -898,6 +925,12 @@ static int __devinit do_boot_cpu(int api
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+       /* Give up lazy state borrowed by this idle vcpu */
+       __sync_lazy_execstate();
+}
+
 void cpu_exit_clear(void)
 {
        int cpu = raw_smp_processor_id();
@@ -906,7 +939,6 @@ void cpu_exit_clear(void)
 
        cpucount --;
        cpu_uninit();
-       irq_ctx_exit(cpu);
 
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
@@ -915,26 +947,9 @@ void cpu_exit_clear(void)
        unmap_cpu_to_logical_apicid(cpu);
 }
 
-struct warm_boot_cpu_info {
-       struct completion *complete;
-       int apicid;
-       int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
-       struct warm_boot_cpu_info *info = p;
-       do_boot_cpu(info->apicid, info->cpu);
-       complete(info->complete);
-}
-
 static int __cpuinit __smp_prepare_cpu(int cpu)
 {
-       DECLARE_COMPLETION(done);
-       struct warm_boot_cpu_info info;
-       struct work_struct task;
        int     apicid, ret;
-       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
@@ -942,34 +957,12 @@ static int __cpuinit __smp_prepare_cpu(i
                goto exit;
        }
 
-       /*
-        * the CPU isn't initialized at boot time, allocate gdt table here.
-        * cpu_init will initialize it
-        */
-       if (!cpu_gdt_descr->address) {
-               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-               if (!cpu_gdt_descr->address)
-                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-                       ret = -ENOMEM;
-                       goto exit;
-       }
-
-       info.complete = &done;
-       info.apicid = apicid;
-       info.cpu = cpu;
-       INIT_WORK(&task, do_warm_boot_cpu, &info);
-
        tsc_sync_disabled = 1;
 
-       /* init low mem mapping */
-       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-                       KERNEL_PGD_PTRS);
-       flush_tlb_all();
-       schedule_work(&task);
-       wait_for_completion(&done);
+       do_boot_cpu(apicid, cpu);
 
        tsc_sync_disabled = 0;
-       zap_low_mappings();
+
        ret = 0;
 exit:
        return ret;
@@ -1002,6 +995,8 @@ static void __init smp_boot_cpus(unsigne
 
        boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
        x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+       stack_base[0] = stack_start.esp;
 
        /*current_thread_info()->cpu = 0;*/
        /*smp_tune_scheduling();*/
@@ -1173,7 +1168,8 @@ void __devinit smp_prepare_boot_cpu(void
        cpu_set(smp_processor_id(), cpu_callout_map);
        cpu_set(smp_processor_id(), cpu_present_map);
        cpu_set(smp_processor_id(), cpu_possible_map);
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       spin_lock_init(&cpu_add_remove_lock);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1196,11 +1192,12 @@ remove_siblinginfo(int cpu)
                cpu_clear(cpu, cpu_sibling_map[sibling]);
        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);
-       c[cpu].phys_proc_id = 0;
-       c[cpu].cpu_core_id = 0;
+       phys_proc_id[cpu] = BAD_APICID;
+       cpu_core_id[cpu] = BAD_APICID;
        cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
+extern void fixup_irqs(cpumask_t map);
 int __cpu_disable(void)
 {
        cpumask_t map = cpu_online_map;
@@ -1217,12 +1214,15 @@ int __cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
+       local_irq_disable();
        clear_local_APIC();
        /* Allow any queued timer interrupts to get serviced */
        local_irq_enable();
        mdelay(1);
        local_irq_disable();
 
+       destroy_percpu_time();
+
        remove_siblinginfo(cpu);
 
        cpu_clear(cpu, map);
@@ -1241,13 +1241,89 @@ void __cpu_die(unsigned int cpu)
                /* They ack this in play_dead by setting CPU_DEAD */
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
                        printk ("CPU %d is now offline\n", cpu);
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
                        return;
                }
-               msleep(100);
+               mdelay(100);
+               mb();
+               process_pending_timers();
        }
        printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/*
+ * XXX: One important thing missing here is migrating vcpus
+ * from the dead cpu to other online ones and then putting the
+ * whole system into a stop state. That ensures a safe
+ * environment for cpu hotplug/removal in the normal running state.
+ *
+ * However, for the Xen PM case, at this point:
+ *     -> All other domains should have been notified with a PM
+ *        event, and should then be in one of these states:
+ *             * Suspend state, or
+ *             * Paused state, which is forced on any domain
+ *               that does nothing to suspend itself
+ *     -> All vcpus of dom0 (except vcpu0) have already been
+ *        hot removed
+ * with the net effect that all other cpus run only the idle
+ * vcpu. In this special case, we can skip vcpu migration and
+ * the system can be considered to be in a stop state.
+ *
+ * So the current cpu hotplug code is a special version for
+ * PM-specific usage; full cpu hotplug needs more effort later.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+       int err = 0;
+       cpumask_t mask;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (num_online_cpus() == 1) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (!cpu_online(cpu)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       printk("Prepare to bring CPU%d down...\n", cpu);
+       /* Send notification to remote idle vcpu */
+       cpus_clear(mask);
+       cpu_set(cpu, mask);
+       per_cpu(cpu_state, cpu) = CPU_DYING;
+       smp_send_event_check_mask(mask);
+
+       __cpu_die(cpu);
+
+       if (cpu_online(cpu)) {
+               printk("Bad state (DEAD, but in online map) on CPU%d\n",
cpu);
+               err = -EBUSY;
+       }
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+       int err = 0;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (cpu_online(cpu)) {
+               printk("Bring up a online cpu. Bogus!\n");
+               err = -EBUSY;
+               goto out;
+       }
+
+       err = __cpu_up(cpu);
+       if (err < 0)
+               goto out;
+
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
 }
 
 /* From kernel/power/main.c */
@@ -1308,6 +1384,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret=0;
+
+       /*
+        * We do warm boot only on cpus that had booted earlier
+        * Otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during AP kickstart process. It's reset
+        * when a cpu is taken offline from cpu_exit_clear().
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
+
        /* In case one didn't come up */
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff -r f15338af3e2a xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/arch/x86/time.c       Mon Jun 18 03:36:33 2007 -0400
@@ -947,11 +947,15 @@ unsigned long get_localtime(struct domai
         + d->time_offset_seconds;
 }
 
-int time_suspend(void)
+void destroy_percpu_time(void)
 {
     /* Better to cancel calibration timer for accuracy. */
     kill_timer(&this_cpu(cpu_time).calibration_timer);
-
+}
+
+int time_suspend(void)
+{
+    destroy_percpu_time();
     return 0;
 }
 
diff -r f15338af3e2a xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/include/asm-x86/config.h      Mon Jun 18 03:27:04 2007 -0400
@@ -38,6 +38,9 @@
 #define CONFIG_ACPI_BOOT 1
 
 #define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG 1
+#define CONFIG_HOTPLUG_CPU 1
 
 #define HZ 100
 
diff -r f15338af3e2a xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/include/asm-x86/smp.h Mon Jun 18 03:27:04 2007 -0400
@@ -45,14 +45,30 @@ extern void zap_low_mappings(l2_pgentry_
 extern void zap_low_mappings(l2_pgentry_t *base);
 #endif
 
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
+
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];
 
 #define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
 
+/* State of each CPU. */
+#define CPU_ONLINE     0x0002  /* CPU is up */
+#define CPU_DYING      0x0003  /* CPU is requested to die */
+#define CPU_DEAD       0x0004  /* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
 #ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) == CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
 #endif
 
 /*
diff -r f15338af3e2a xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h      Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/include/asm-x86/system.h      Mon Jun 18 03:27:04 2007 -0400
@@ -313,6 +313,8 @@ static always_inline unsigned long long 
 #define __sti()                        __asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()            __asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()                 __asm__ __volatile__("hlt": : :"memory")
 
 /* For spinlocks etc */
 #if defined(__i386__)
diff -r f15338af3e2a xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Mon Jun 18 03:09:47 2007 -0400
+++ b/xen/include/asm-x86/time.h        Mon Jun 18 03:35:21 2007 -0400
@@ -25,5 +25,6 @@ extern int time_resume(void);
 extern int time_resume(void);
 
 extern void init_percpu_time(void);
+extern void destroy_percpu_time(void);
 
 #endif /* __X86_TIME_H__ */
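
For readers tracing the take-down handshake added above: cpu_down()
marks the target CPU_DYING and kicks it with an event-check IPI; the
target's idle loop notices cpu_is_offline() and calls play_dead(); and
__cpu_die() polls until the target acknowledges with CPU_DEAD. The
tail of play_dead() is not visible in the hunks above, so the sketch
below of the dying CPU's side is an assumption (the CPU_DEAD store and
the halt() loop in particular), inferred from the "They ack this in
play_dead by setting CPU_DEAD" comment in __cpu_die():

/* Illustrative sketch of the dying CPU's side; the tail of
 * play_dead() beyond wbinvd() is assumed, not shown in the patch. */
static inline void play_dead_sketch(void)
{
    __cpu_disable();   /* leave cpu_online_map, fix up IRQ affinity */
    cpu_exit_clear();  /* clear callin/callout maps, uninit the cpu */
    wbinvd();          /* flush caches before going quiet */
    mb();
    /* Ack __cpu_die(), which polls per_cpu(cpu_state, cpu). */
    per_cpu(cpu_state, smp_processor_id()) = CPU_DEAD;
    for ( ; ; )
        halt();        /* interrupts stay off: per the comment in
                        * domain.c we just spin; the CPU is not
                        * physically taken down */
}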

Attachment: xen_cpu_hotplug_for_pm.patch
Description: xen_cpu_hotplug_for_pm.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel