With the larger default NR_CPUS config setting (and even more so with build-time settings exceeding this default), the amount of memory (and potentially other resources) wasted just because cpu_possible_map doesn't get set up properly increases. Use Linux's prefill_possible_map() (accordingly modified to fit Xen) to overcome this. This makes an adjustment to tasklet initialization necessary: it must not happen before cpu_possible_map is guaranteed to be fully set up (according to my static code analysis this was a problem on ia64 anyway). The tracing code also needed a minor adjustment, as it still accessed per-CPU data from a simple counted loop over NR_CPUS.

Signed-off-by: Jan Beulich

--- 2010-05-04.orig/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -776,7 +776,7 @@ void __cpu_die(unsigned int cpu)
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void
-smp_cpus_done (unsigned int dummy)
+smp_cpus_done(void)
 {
 	int cpu;
 	unsigned long bogosum = 0;
--- 2010-05-04.orig/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:42:36.000000000 +0200
@@ -562,10 +562,12 @@ skip_move:
     end_boot_allocator();
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     late_setup_arch(&cmdline);
 
+    tasklet_subsys_init();
+
     scheduler_init();
     idle_vcpu[0] = (struct vcpu*) ia64_r13;
     idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
@@ -626,7 +628,7 @@ printk("num_online_cpus=%d, max_cpus=%d\
     local_irq_disable();
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 #endif
 
     initialise_gdb(); /* could be moved earlier */
--- 2010-05-04.orig/xen/arch/x86/mpparse.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/mpparse.c	2010-05-04 13:22:06.000000000 +0200
@@ -35,7 +35,6 @@
 
 /* Have we found an MP table */
 int smp_found_config;
-unsigned int __devinitdata maxcpus = NR_CPUS;
 
 /*
  * Various Linux-internal data structures created from the
@@ -66,7 +65,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __devinitdata num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -105,8 +104,10 @@ static int __devinit MP_processor_info (
 	int ver, apicid, cpu = 0;
 	physid_mask_t phys_cpu;
 
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
+	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+		++disabled_cpus;
 		return -EINVAL;
+	}
 
 	apicid = mpc_apic_id(m, translation_table[mpc_record]);
 
@@ -185,9 +186,9 @@ static int __devinit MP_processor_info (
 		return -ENOSPC;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
+	if (max_cpus && num_processors >= max_cpus) {
+		printk(KERN_WARNING "WARNING: maxcpus limit of %u reached."
+			" Processor ignored.\n", max_cpus);
 		return -ENOSPC;
 	}
 
--- 2010-05-04.orig/xen/arch/x86/setup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/setup.c	2010-05-04 16:43:22.000000000 +0200
@@ -61,7 +61,7 @@ static int __initdata opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
 
 /* maxcpus: maximum number of CPUs to activate. */
-static unsigned int __initdata max_cpus = NR_CPUS;
+unsigned int __devinitdata max_cpus;
 integer_param("maxcpus", max_cpus);
 
 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
@@ -568,6 +568,11 @@ void __init __start_xen(unsigned long mb
     if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
         EARLY_FAIL("Misaligned CPU0 stack.\n");
 
+    if ( opt_nosmp )
+        max_cpus = prefill_possible_map(1);
+    else if ( max_cpus )
+        max_cpus = prefill_possible_map(max_cpus);
+
     if ( e820_raw_nr != 0 )
     {
         memmap_type = "Xen-e820";
@@ -978,7 +983,7 @@ void __init __start_xen(unsigned long mb
 #endif
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     early_cpu_init();
 
@@ -1017,6 +1022,11 @@ void __init __start_xen(unsigned long mb
     zap_low_mappings();
 #endif
 
+    if ( !max_cpus )
+        max_cpus = prefill_possible_map(0);
+
+    tasklet_subsys_init();
+
     init_apic_mappings();
 
     percpu_free_unused_areas();
@@ -1049,12 +1059,9 @@ void __init __start_xen(unsigned long mb
     vesa_mtrr_init();
 #endif
 
-    if ( opt_nosmp )
-        max_cpus = 0;
-
     iommu_setup();    /* setup iommu if available */
 
-    smp_prepare_cpus(max_cpus);
+    smp_prepare_cpus(!opt_nosmp * max_cpus);
 
     spin_debug_enable();
 
@@ -1087,7 +1094,7 @@ void __init __start_xen(unsigned long mb
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 
     initialise_gdb(); /* could be moved earlier */
 
--- 2010-05-04.orig/xen/arch/x86/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -83,10 +83,12 @@ EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
+unsigned int __devinitdata disabled_cpus;
+
 /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
  * is no way to resync one AP against BP. TBD: for prescott and above, we
  * should use IA64's algorithm
@@ -829,7 +831,11 @@ int alloc_cpu_id(void)
 {
 	cpumask_t	tmp_map;
 	int cpu;
-	cpus_complement(tmp_map, cpu_present_map);
+
+	if (max_cpus)
+		cpus_andnot(tmp_map, cpu_possible_map, cpu_present_map);
+	else
+		cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 	if (cpu >= NR_CPUS)
 		return -ENODEV;
@@ -1243,6 +1249,52 @@ void __devinit smp_prepare_boot_cpu(void
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
+/*
+ * cpu_possible_mask should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with max_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+unsigned int __init prefill_possible_map(unsigned int max_cpus)
+{
+	unsigned int i, possible;
+
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
+	if (!max_cpus)
+		possible = num_processors + disabled_cpus;
+	else
+		possible = max_cpus;
+
+	if (possible > NR_CPUS) {
+		printk(KERN_WARNING
+			"%u processors exceeds NR_CPUS limit of %d\n",
+			possible, NR_CPUS);
+		possible = NR_CPUS;
+	}
+
+	printk(KERN_INFO "SMP: Allowing %u CPUs, %d hotplug CPUs\n",
+		possible, max_t(int, possible - num_processors, 0));
+
+	for (i = 0; i < possible; i++)
+		cpu_set(i, cpu_possible_map);
+
+	return possible;
+}
+
 static void
 remove_siblinginfo(int cpu)
 {
@@ -1568,7 +1620,7 @@ int __devinit __cpu_up(unsigned int cpu)
 }
 
 
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(void)
 {
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
--- 2010-05-04.orig/xen/common/tasklet.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/tasklet.c	2010-05-04 16:46:03.000000000 +0200
@@ -18,7 +18,7 @@
 #include 
 
 /* Some subsystems call into us before we are initialised. We ignore them. */
-static bool_t tasklets_initialised;
+static unsigned int __read_mostly tasklets_initialised = UINT_MAX;
 
 /*
  * NB. Any modification to a tasklet_list requires the scheduler to run
@@ -35,7 +35,8 @@ void tasklet_schedule_on_cpu(struct task
 
     spin_lock_irqsave(&tasklet_lock, flags);
 
-    if ( tasklets_initialised && !t->is_dead )
+    if ( (tasklets_initialised == NR_CPUS || tasklets_initialised == cpu) &&
+         !t->is_dead )
     {
         t->scheduled_on = cpu;
         if ( !t->is_running )
@@ -161,14 +162,24 @@ void tasklet_init(
     t->data = data;
 }
 
+void __init tasklet_early_init(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+
+    tasklets_initialised = cpu;
+}
+
 void __init tasklet_subsys_init(void)
 {
     unsigned int cpu;
 
    for_each_possible_cpu ( cpu )
-        INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+        if ( cpu != tasklets_initialised )
+            INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
 
-    tasklets_initialised = 1;
+    tasklets_initialised = NR_CPUS;
 }
 
 /*
--- 2010-05-04.orig/xen/common/trace.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/trace.c	2010-05-04 16:26:50.000000000 +0200
@@ -289,7 +289,7 @@ void __init init_trace_bufs(void)
         return;
     }
 
-    for(i = 0; i < NR_CPUS; i++)
+    for_each_possible_cpu(i)
         spin_lock_init(&per_cpu(t_lock, i));
 
     for(i=0; i
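
For reference, a standalone sketch of the sizing logic prefill_possible_map() applies (not part of the patch; NR_CPUS, per_cpu_bytes and the processor counts below are made-up example values): without a "maxcpus=" override the possible set becomes the enabled plus disabled processors reported by the MP table/MADT, clamped to NR_CPUS, instead of all NR_CPUS bits.

#include <stdio.h>

#define NR_CPUS 256                          /* build-time limit (example value) */

int main(void)
{
    unsigned int num_processors = 4;         /* enabled entries from MP table/MADT */
    unsigned int disabled_cpus  = 2;         /* disabled (hotpluggable) entries */
    unsigned int max_cpus       = 0;         /* 0: no "maxcpus=" override given */
    unsigned int per_cpu_bytes  = 64 * 1024; /* hypothetical per-CPU footprint */

    /* Mirrors the possible-CPU computation in prefill_possible_map()
     * (minus the "no processors found" fallback). */
    unsigned int possible = max_cpus ? max_cpus
                                     : num_processors + disabled_cpus;
    if (possible > NR_CPUS)
        possible = NR_CPUS;

    printf("possible CPUs: %u (instead of %u)\n", possible, NR_CPUS);
    printf("per-CPU memory no longer wasted: %u KiB\n",
           (NR_CPUS - possible) * per_cpu_bytes / 1024);
    return 0;
}

With the example numbers this prints "possible CPUs: 6 (instead of 256)"; per-CPU data sized or initialised from cpu_possible_map (such as the tasklet lists and trace locks touched above) then covers 6 CPUs rather than 256.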