
Re: [Xen-devel] [PATCH v3] Xen: Spread boot time page scrubbing across all available CPU's



>>> On 11.04.14 at 20:08, <konrad@xxxxxxxxxx> wrote:

(I'll try to not duplicate comments others already made.)

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata = 1;
>  boolean_param("bootscrub", opt_bootscrub);
>  
>  /*
> + * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
> + */
> +static unsigned int __initdata opt_bootscrub_chunk = 128 * 1024 * 1024;

MB(128). Also perhaps better "unsigned long", to allow the value to
be 4GiB or above? Or alternatively make this a page count instead
of a byte count.
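
E.g. (just a sketch; the size_param() registration is only my guess at
how the option would end up getting parsed):

    /*
     * Size (in bytes) of the memory block to scrub with the heap lock held.
     */
    static unsigned long __initdata opt_bootscrub_chunk = MB(128);
    size_param("bootscrub_chunk", opt_bootscrub_chunk);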

> @@ -1256,45 +1272,166 @@ void __init end_boot_allocator(void)
>      printk("\n");
>  }
>  
> +void __init smp_scrub_heap_pages(void *data)

static

> +{
> +    unsigned long mfn, start, end;
> +    struct page_info *pg;
> +    struct scrub_region *r;
> +    unsigned int temp_cpu, node, cpu_idx = 0;
> +    unsigned int cpu = smp_processor_id();
> +
> +    if ( data )
> +        r = data;
> +    else {
> +        node = cpu_to_node(cpu);
> +        if ( node == NUMA_NO_NODE )
> +            return;
> +        r = &region[node];
> +    }
> +    ASSERT(r != NULL);
> +
> +    /* Determine the current CPU's index into CPU's linked to this node*/

There's not just a blank missing here, but also a full stop (and similarly
in various other comments).

>  void __init scrub_heap_pages(void)
>  {
> -    unsigned long mfn;
> -    struct page_info *pg;
> +    cpumask_t node_cpus, temp_cpus, all_worker_cpus = {{ 0 }};

No open coding like this please. Either (preferred) use cpumask_clear()
or limit the initializer to {} (thus not depending on anything other than
the fact that this is a compound type).
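
I.e. either

    cpumask_t node_cpus, temp_cpus, all_worker_cpus = {};

or, preferably, drop the initializer and do

    cpumask_clear(&all_worker_cpus);

before the mask gets used.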

> +    unsigned int i, j, cpu, sibling;
> +    unsigned long offset, max_per_cpu_sz = 0;
> +    unsigned long start, end;
> +    unsigned long rem = 0;
>  
>      if ( !opt_bootscrub )
>          return;
>  
> -    printk("Scrubbing Free RAM: ");
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
> +    if ( chunk_size == 0 )
> +        chunk_size = 1;

This seems awfully low - I'd suggest reverting to the default when the
passed value is zero.
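
I.e. something along the lines of (re-using the default from the option's
declaration):

    chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
    if ( chunk_size == 0 )
        chunk_size = MB(128) >> PAGE_SHIFT;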

> -    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
> +    /* Round #0 - figure out amounts and which CPUs to use */
> +    for_each_online_node ( i )
>      {
> +        /* Calculate Node memory start and end address */
> +        start = max(node_start_pfn(i), first_valid_mfn);
> +        end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
> +        /* CPUs that are online and on this node (if none, that it is OK */

There's a closing parenthesis missing here.

> +        cpumask_and(&node_cpus, &node_to_cpumask(i), &cpu_online_map);
> +        cpumask_copy(&temp_cpus, &node_cpus);
> +        /* Rip out threads. */
> +        for_each_cpu ( j, &temp_cpus )
> +        {
> +            cpu = 0;
> +            for_each_cpu(sibling, per_cpu(cpu_sibling_mask, j)) {

We commonly treat these for_each_ constructs as extended
keywords, i.e. requiring blanks on both sides of the parentheses. I'd
be fine with either style, as long as you use it consistently. But across
the last dozen lines you have three different variants.
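
E.g., with the blank-padded variant used throughout, the inner loop
would become

    for_each_cpu ( sibling, per_cpu(cpu_sibling_mask, j) )
    {
        if ( cpu++ == 0 ) /* Skip 1st CPU - the core. */
            continue;
        cpumask_clear_cpu(sibling, &node_cpus);
    }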

> +                if (cpu++ == 0) /* Skip 1st CPU - the core */

Coding style.

> +                    continue;
> +                cpumask_clear_cpu(sibling, &node_cpus);

I think this can be done without the inner loop:

cpumask_clear(&temp_cpus);
for_each_cpu ( j, &node_cpus )
{
    if ( cpumask_intersects(&temp_cpus, per_cpu(cpu_sibling_mask, j)) )
        continue;
    cpu = cpumask_any(per_cpu(cpu_sibling_mask, j));
    cpumask_set_cpu(cpu, &temp_cpus);
}

(Of course you may prefer cpumask_first() over cpumask_any().)

> +            }
> +        }
> +        cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
> +        if ( cpumask_empty(&node_cpus) ) { /* No CPUs on this node. */
> +            rem = 0;
> +            region[i].per_cpu_sz = (end - start);
> +        } else {
> +            rem = (end - start) % cpumask_weight(&node_cpus);
> +            region[i].per_cpu_sz = (end - start) / cpumask_weight(&node_cpus);
> +            if ( region[i].per_cpu_sz > max_per_cpu_sz )
> +                max_per_cpu_sz = region[i].per_cpu_sz;
> +        }
> +        region[i].start = start;
> +        region[i].rem = rem;
> +        cpumask_copy(&region[i].cpu, &node_cpus);
> +
> +    }
> +    cpu = smp_processor_id();
> +    /* Round default chunk size down if required */
> +    if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz )
> +        chunk_size = max_per_cpu_sz;
> +
> +    printk("Scrubbing Free RAM on %u nodes using %u CPUs: ", 
> num_online_nodes(),
> +           cpumask_weight(&all_worker_cpus));
> +
> +    /* Round: #1 - do NUMA nodes with CPUs */
> +    for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
> +    {
> +        for_each_online_node ( i )
> +            region[i].offset = offset;
> +
>          process_pending_softirqs();
>  
> -        pg = mfn_to_page(mfn);
> +        spin_lock(&heap_lock);
> +        on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
> +        spin_unlock(&heap_lock);
>  
> -        /* Quick lock-free check. */
> -        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
> +        printk(".");
> +    }
> +
> +    /* Round #2: NUMA nodes with no CPUs get scrubbed with all CPUs. */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
>              continue;
>  
> -        /* Every 100MB, print a progress dot. */
> -        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
> -            printk(".");
> +        /* We already have the node information from round #0 */
> +        end = region[i].start + region[i].per_cpu_sz;

The value calculated here doesn't seem to be used anywhere.

> +        rem = region[i].per_cpu_sz % cpumask_weight(&all_worker_cpus);
>  
> -        spin_lock(&heap_lock);
> +        region[i].rem = rem;
> +        region[i].per_cpu_sz /= cpumask_weight(&all_worker_cpus);
> +        max_per_cpu_sz = region[i].per_cpu_sz;
> +        if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz )
> +            chunk_size = max_per_cpu_sz;

So you may end up limiting chunk size further on each iteration.
That's surely not very efficient.
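
I.e. rather than clamping the global, something like a local limit
(local_chunk being a made-up name here; this assumes the worker can be
told the chunk size, e.g. via the region) would avoid the shrinking
carrying over to later nodes:

    unsigned long local_chunk = chunk_size;

    if ( max_per_cpu_sz && local_chunk > max_per_cpu_sz )
        local_chunk = max_per_cpu_sz;

    for ( offset = 0; offset < max_per_cpu_sz; offset += local_chunk )
    {
        region[i].offset = offset;
        /* dispatch to all_worker_cpus as before */
    }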

> +        cpumask_copy(&region[i].cpu, &all_worker_cpus);
>  
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
> +        {
> +            region[i].offset = offset;
>  
> -        spin_unlock(&heap_lock);
> -    }
> +            process_pending_softirqs();
> +
> +            spin_lock(&heap_lock);
> +            on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, &region[i], 1);

So in round #2 you're telling all CPUs to scrub one node's memory. That
ought to be particularly inefficient when you have relatively many
CPUs and relatively little memory on the node. In the hope that nodes
without CPUs aren't that common (or are reasonably symmetric in count
relative to the nodes with CPUs), wouldn't it be better (and yield
simpler code) to have each CPU scrub one such node in its entirety?
(Ultimately, if this turns out to be a more common case, node distance
could even be taken into consideration when picking which CPU does
what.)
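
Roughly (only a sketch of the idea, ignoring node distance and keeping
the per-node bounds established in round #0):

    cpu = cpumask_first(&all_worker_cpus);
    for_each_online_node ( i )
    {
        if ( !cpumask_empty(&node_to_cpumask(i)) )
            continue;

        /* Hand the whole CPU-less node to a single CPU. */
        cpumask_clear(&region[i].cpu);
        cpumask_set_cpu(cpu, &region[i].cpu);
        region[i].offset = 0;

        cpu = cpumask_cycle(cpu, &all_worker_cpus);
    }

with the actual scrubbing then again kicked off via on_selected_cpus(),
passing the respective region.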

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
