Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in

Konrad, Thanks for the comments. I will make the changes and send over
the patch.

Any comments on the general approach are welcome too.

-dulloor

On Tue, Feb 16, 2010 at 12:49 PM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
> Run this patch through scripts/checkpatch.pl
>
> On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote:
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index 86bef0f..7a24070 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
>>       /*
>>        * Parse SRAT to discover nodes.
>>        */
>> -     acpi_numa_init();
>> +    if (acpi_numa > 0)
>> +         acpi_numa_init();
>
> Why? Why can't we just try  acpi_numa_init()?
>
>>  #endif
>>
>>       initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..14fa654 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,9 @@
>>  #include <linux/ctype.h>
>>  #include <linux/module.h>
>>  #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>>  #include <linux/sched.h>
>> +#include <xen/interface/xen.h>
>>
>>  #include <asm/e820.h>
>>  #include <asm/proto.h>
>> @@ -19,6 +21,7 @@
>>  #include <asm/numa.h>
>>  #include <asm/acpi.h>
>>  #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>
> If one does not set CONFIG_XEN does this compile?
>>
>>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>>  EXPORT_SYMBOL(node_data);
>> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long 
>> start_pfn, unsigned long last_pfn
>>        */
>>       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>>               long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> No need for this.
>>               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>>               if (num_nodes < 0)
>>                       return num_nodes;
>> @@ -522,16 +524,162 @@ out:
>>       numa_init_array();
>>       return 0;
>>  }
>> +struct xen_domain_numa_layout pv_numa_layout;
>> +
>> +void dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> +    unsigned int i, j;
>> +    char vcpumask[128];
>> +    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
>> +                            layout->max_vcpus, layout->max_vnodes);
>
> Redo the printk's. They need KERN_DEBUG
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        cpumask_scnprintf(vcpumask, sizeof(vcpumask),
>> +                                ((cpumask_t *)&vnode_data->vcpu_mask));
>> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
>> +                vnode_data->vnode_id, vnode_data->mnode_id,
>> +                (unsigned long)vnode_data->nr_pages, vcpumask);
>
> This one too.
>> +    }
>> +
>> +    printk("vnode distances :\n");
>
> and
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +        printk("\tvnode[%u]", i);
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        printk("\nvnode[%u]", i);
>
> this
>> +        for (j = 0; j < layout->max_vnodes; j++)
>> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + 
>> j]);
>> +        printk("\n");
> one to.
>> +    }
>> +    return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout 
>> *layout)
>> +{
>> +    /* Construct a slit table (using layout->vnode_distance).
>> +     * Copy it to acpi_slit. */
>> +    return;
>> +}
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout 
>> *layout)
>> +{
>> +     int vcpu, vnode;
>> +
>> +     printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>
> pr_debug instead?
>> +
>> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> +        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
>> +
>> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> +        {
>> +            if (cpu_isset(vcpu, vcpu_mask))
>> +            {
>> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> +                {
>> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> +                                        vcpu, early_cpu_to_node(vcpu));
>> +                    continue;
>> +                }
>> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> +                     numa_set_node(vcpu, vnode);
>> +            }
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> +                            unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +     int num_vnodes, i;
>> +    u64 node_start_addr, node_end_addr, max_addr;
>> +
>> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> +                                        layout->max_vnodes, 
>> layout->max_vcpus);
>> +    dump_numa_layout(layout);
>> +     memset(&nodes, 0, sizeof(nodes));
>> +
>> +    num_vnodes = layout->max_vnodes;
>> +    BUG_ON(num_vnodes > MAX_NUMNODES);
>
> Hmm.. Is that really neccessary? What if we just do WARN("some lengthy 
> explanation"),
> and bail out and not initialize these structures?
>> +
>> +    max_addr = last_pfn << PAGE_SHIFT;
>> +
>> +    node_start_addr = start_pfn << PAGE_SHIFT;
>> +    for (i = 0; i < num_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> +             node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> +             if (i == (num_vnodes-1))
>> +                     node_end_addr = max_addr;
>> +             else
>> +        {
>> +            node_end_addr = node_start_addr + node_size;
>> +                     while ((node_end_addr - node_start_addr -
>> +                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
>> +            {
>> +                node_end_addr += FAKE_NODE_MIN_SIZE;
>> +                             if (node_end_addr > max_addr) {
>> +                                     node_end_addr = max_addr;
>> +                                     break;
>> +                             }
>> +                     }
>> +        }
>> +        /* node_start_addr updated inside the function */
>> +        if (setup_node_range(i, nodes, &node_start_addr,
>> +                    (node_end_addr-node_start_addr), max_addr+1))
>> +            BUG();
>> +    }
>> +
>> +     printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>
> Is this neccessary? Can be it be debug?
>> +
>> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> +    if (memnode_shift < 0) {
>> +         printk(KERN_ERR "No NUMA hash function found.\n");
>> +        BUG();
>
> Wow. BUG? What about just bailing out and unset xen_pv_emulation flag?
>
>> +    }
>> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! 
>> */
>> +     /*
>> +      * We need to vacate all active ranges that may have been registered by
>> +      * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> +      * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
>> +      */
>> +     remove_all_active_ranges();
>> +
>> +    BUG_ON(acpi_numa >= 0);
>> +     for_each_node_mask(i, node_possible_map) {
>> +             e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> +                                             nodes[i].end >> PAGE_SHIFT);
>> +             setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> +     }
>> +    xen_init_slit_table(layout);
>> +     xen_init_numa_array(layout);
>> +     return 0;
>> +}
>>  #endif /* CONFIG_NUMA_EMU */
>>
>>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>>  {
>>       int i;
>> +    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
>> +
>> +    int xen_pv_numa_enabled = numa_layout->max_vnodes;
>>
>>       nodes_clear(node_possible_map);
>>       nodes_clear(node_online_map);
>>
>>  #ifdef CONFIG_NUMA_EMU
>> +    if (xen_pv_domain() && xen_pv_numa_enabled)
>> +    {
>> +        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
>> +            return;
>> +    }
>> +
>>       if (cmdline && !numa_emulation(start_pfn, last_pfn))
>>               return;
>>       nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index ecb9b0d..b020555 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -83,6 +83,8 @@ void *xen_initial_gdt;
>>   */
>>  struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
>>
>> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
>> +
>>  /*
>>   * Flag to determine whether vcpu info placement is available on all
>>   * VCPUs.  We assume it is to start with, and then set it to zero on
>> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
>>       pv_cpu_ops.load_gdt = xen_load_gdt;
>>  }
>>
>> +extern struct xen_domain_numa_layout pv_numa_layout;
>>  /* First C function to be called on Xen boot */
>>  asmlinkage void __init xen_start_kernel(void)
>>  {
>> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
>>               xen_start_info->console.domU.evtchn = 0;
>>       }
>>
>> +    {
>> +        struct xen_domain_numa_layout *layout =
>> +            (void *)((char *)xen_start_info +
>> +                        xen_start_info->numa_layout_info.info_off);
>> +        memcpy(&pv_numa_layout, layout, sizeof(*layout));
>> +    }
>
> Shouldn't there be a check to see if the Xen actually exports
> this structure? Otherwise you might copy garbage in and set
> pv_numa_layout values to random values.
>
>>       xen_raw_console_write("about to get started...\n");
>>
>>       /* Start the world */
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index 612f2c9..cb944a2 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
>>               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>>               disable_acpi();
>>       }
>> +
>> +    acpi_numa = -1;
>> +    numa_off = 1;
>>  #endif
>>
>>       /*
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 812ffd5..d4588fa 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -398,6 +398,53 @@ struct shared_info {
>>
>>  };
>>
>> +#define XEN_NR_CPUS 64
> No way to share this with some other variable? Like NR_CPUS?
>
>> +#if defined(__i386__)
>> +#define XEN_BITS_PER_LONG 32
>> +#define XEN_BYTES_PER_LONG 4
>
> There gotta be some of these defined in other headers.
>> +#define XEN_LONG_BYTEORDER 2
>> +#elif defined(__x86_64__)
>> +#define XEN_BITS_PER_LONG 64
>> +#define XEN_BYTES_PER_LONG 8
>> +#define XEN_LONG_BYTEORDER 3
>> +#endif
>> +
>> +/* same as cpumask_t - in xen and even Linux (for now) */
>> +#define XEN_BITS_TO_LONGS(bits) \
>> +    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
>> +#define XEN_DECLARE_BITMAP(name,bits) \
>> +    unsigned long name[XEN_BITS_TO_LONGS(bits)]
>
> I am pretty sure there are macros for this laready.
>> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
>> +#ifndef __XEN__
>
> Is that neccessary? typedefs are frowned upon in kernel.
>> +typedef struct xen_cpumask xen_cpumask_t;
>> +#endif
>> +
>> +#define XEN_MAX_VNODES 8
>> +struct xen_vnode_data {
>> +    uint32_t vnode_id;
>> +    uint32_t mnode_id;
>> +    uint64_t nr_pages;
>> +    /* XXX: Can we use this in xen<->domain interfaces ? */
>> +    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_vnode_data xen_vnode_data_t;
>
> Ditto.
>> +#endif
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +struct xen_domain_numa_layout {
>> +    uint32_t max_vcpus;
>> +    uint32_t max_vnodes;
>> +
>> +    /* Only (max_vnodes*max_vnodes) entries are filled */
>> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
>
> Ditto.
>> +#endif
>> +
>>  /*
>>   * Start-of-day memory layout for the initial domain (DOM0):
>>   *  1. The domain is started within contiguous virtual-memory region.
>> @@ -449,6 +496,13 @@ struct start_info {
>>       unsigned long mod_start;    /* VIRTUAL address of pre-loaded module. 
>>  */
>>       unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     
>> */
>>       int8_t cmd_line[MAX_GUEST_CMDLINE];
>> +    /* The pfn range here covers both page table and p->m table frames.   */
>> +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
>
> Why is this added in this patch? That doesn't look to be used by your
> patch.
>> +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
>
> Ditto.
>> +    struct {
>> +        uint32_t info_off;  /* Offset of console_info struct.         *
>> +        uint32_t info_size; /* Size of console_info struct from start.*/
>
> Wrong comment.
>> +    } numa_layout_info;
>
>>  };
>>
>>  struct dom0_vga_console_info {
>
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in dom0]