WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in dom0]

To: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Subject: Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in dom0]
From: Dulloor <dulloor@xxxxxxxxx>
Date: Thu, 18 Feb 2010 11:24:19 -0500
Cc: Andre Przywara <andre.przywara@xxxxxxx>, Ian Pratt <Ian.Pratt@xxxxxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>, Keir Fraser <keir.fraser@xxxxxxxxxxxxx>, "Nakajima, Jun" <jun.nakajima@xxxxxxxxx>
Delivery-date: Thu, 18 Feb 2010 08:25:15 -0800
Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:received:in-reply-to:references :date:message-id:subject:from:to:cc:content-type :content-transfer-encoding; bh=+vr+e9FydGdQrB4p4JmDkAvjHyEdJuuet2xtypWs+bA=; b=IO70X7/Bdhry2C20s6VCMQkY8LVfWhFRjje1e4fqI8bT28YBt+v7ZRZKiguY+ltQ2x Q8oKaNBx0DOOtDaoZd00qVyKcnWDscbxEPfMUOH7X2VhlujtTnpk6AMeT0UBNCD4sio0 h3eArOhuG3QTHw81+KVE1DoTe7uOeu/pX8OzI=
Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:in-reply-to:references:date:message-id:subject:from:to :cc:content-type:content-transfer-encoding; b=L8ywl5NTOhQy8jL4N+GKoKEEfXQ0MsjurylIRACbxSBogUmlOmZJNVIDIpBZXioBtK 05u+U+/T53afxFW5+8TQwX8rcsboZGOwwdZj8Qaxjnfo4K9JUJpgoVNo4CzEoXtgSVf2 KkgAAnV+Eb/8KUDgyOjnJLtsdZ10GknvgraI4=
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <940bcfd21002122225i2c79aad2q91ff6432faef3192@xxxxxxxxxxxxxx> <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Konrad, Thanks for the comments. I will make the changes and send over
the patch.

Any comments on the general approach are welcome too.

-dulloor

On Tue, Feb 16, 2010 at 12:49 PM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
> Run this patch through scripts/checkpatch.pl
>
> On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote:
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index 86bef0f..7a24070 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
>>       /*
>>        * Parse SRAT to discover nodes.
>>        */
>> -     acpi_numa_init();
>> +    if (acpi_numa > 0)
>> +         acpi_numa_init();
>
> Why? Why can't we just try  acpi_numa_init()?
>
>>  #endif
>>
>>       initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..14fa654 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,9 @@
>>  #include <linux/ctype.h>
>>  #include <linux/module.h>
>>  #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>>  #include <linux/sched.h>
>> +#include <xen/interface/xen.h>
>>
>>  #include <asm/e820.h>
>>  #include <asm/proto.h>
>> @@ -19,6 +21,7 @@
>>  #include <asm/numa.h>
>>  #include <asm/acpi.h>
>>  #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>
> If one does not set CONFIG_XEN does this compile?
>>
>>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>>  EXPORT_SYMBOL(node_data);
>> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long 
>> start_pfn, unsigned long last_pfn
>>        */
>>       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>>               long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> No need for this.
>>               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>>               if (num_nodes < 0)
>>                       return num_nodes;
>> @@ -522,16 +524,162 @@ out:
>>       numa_init_array();
>>       return 0;
>>  }
>> +struct xen_domain_numa_layout pv_numa_layout;
>> +
>> +void dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> +    unsigned int i, j;
>> +    char vcpumask[128];
>> +    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
>> +                            layout->max_vcpus, layout->max_vnodes);
>
> Redo the printk's. They need KERN_DEBUG
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        cpumask_scnprintf(vcpumask, sizeof(vcpumask),
>> +                                ((cpumask_t *)&vnode_data->vcpu_mask));
>> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
>> +                vnode_data->vnode_id, vnode_data->mnode_id,
>> +                (unsigned long)vnode_data->nr_pages, vcpumask);
>
> This one too.
>> +    }
>> +
>> +    printk("vnode distances :\n");
>
> and
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +        printk("\tvnode[%u]", i);
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        printk("\nvnode[%u]", i);
>
> this
>> +        for (j = 0; j < layout->max_vnodes; j++)
>> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + 
>> j]);
>> +        printk("\n");
> one to.
>> +    }
>> +    return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout 
>> *layout)
>> +{
>> +    /* Construct a slit table (using layout->vnode_distance).
>> +     * Copy it to acpi_slit. */
>> +    return;
>> +}
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout 
>> *layout)
>> +{
>> +     int vcpu, vnode;
>> +
>> +     printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>
> pr_debug instead?
>> +
>> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> +        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
>> +
>> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> +        {
>> +            if (cpu_isset(vcpu, vcpu_mask))
>> +            {
>> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> +                {
>> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> +                                        vcpu, early_cpu_to_node(vcpu));
>> +                    continue;
>> +                }
>> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> +                     numa_set_node(vcpu, vnode);
>> +            }
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> +                            unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +     int num_vnodes, i;
>> +    u64 node_start_addr, node_end_addr, max_addr;
>> +
>> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> +                                        layout->max_vnodes, 
>> layout->max_vcpus);
>> +    dump_numa_layout(layout);
>> +     memset(&nodes, 0, sizeof(nodes));
>> +
>> +    num_vnodes = layout->max_vnodes;
>> +    BUG_ON(num_vnodes > MAX_NUMNODES);
>
> Hmm.. Is that really neccessary? What if we just do WARN("some lengthy 
> explanation"),
> and bail out and not initialize these structures?
>> +
>> +    max_addr = last_pfn << PAGE_SHIFT;
>> +
>> +    node_start_addr = start_pfn << PAGE_SHIFT;
>> +    for (i = 0; i < num_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> +             node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> +             if (i == (num_vnodes-1))
>> +                     node_end_addr = max_addr;
>> +             else
>> +        {
>> +            node_end_addr = node_start_addr + node_size;
>> +                     while ((node_end_addr - node_start_addr -
>> +                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
>> +            {
>> +                node_end_addr += FAKE_NODE_MIN_SIZE;
>> +                             if (node_end_addr > max_addr) {
>> +                                     node_end_addr = max_addr;
>> +                                     break;
>> +                             }
>> +                     }
>> +        }
>> +        /* node_start_addr updated inside the function */
>> +        if (setup_node_range(i, nodes, &node_start_addr,
>> +                    (node_end_addr-node_start_addr), max_addr+1))
>> +            BUG();
>> +    }
>> +
>> +     printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>
> Is this neccessary? Can be it be debug?
>> +
>> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> +    if (memnode_shift < 0) {
>> +         printk(KERN_ERR "No NUMA hash function found.\n");
>> +        BUG();
>
> Wow. BUG? What about just bailing out and unset xen_pv_emulation flag?
>
>> +    }
>> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! 
>> */
>> +     /*
>> +      * We need to vacate all active ranges that may have been registered by
>> +      * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> +      * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
>> +      */
>> +     remove_all_active_ranges();
>> +
>> +    BUG_ON(acpi_numa >= 0);
>> +     for_each_node_mask(i, node_possible_map) {
>> +             e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> +                                             nodes[i].end >> PAGE_SHIFT);
>> +             setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> +     }
>> +    xen_init_slit_table(layout);
>> +     xen_init_numa_array(layout);
>> +     return 0;
>> +}
>>  #endif /* CONFIG_NUMA_EMU */
>>
>>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>>  {
>>       int i;
>> +    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
>> +
>> +    int xen_pv_numa_enabled = numa_layout->max_vnodes;
>>
>>       nodes_clear(node_possible_map);
>>       nodes_clear(node_online_map);
>>
>>  #ifdef CONFIG_NUMA_EMU
>> +    if (xen_pv_domain() && xen_pv_numa_enabled)
>> +    {
>> +        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
>> +            return;
>> +    }
>> +
>>       if (cmdline && !numa_emulation(start_pfn, last_pfn))
>>               return;
>>       nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index ecb9b0d..b020555 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -83,6 +83,8 @@ void *xen_initial_gdt;
>>   */
>>  struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
>>
>> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
>> +
>>  /*
>>   * Flag to determine whether vcpu info placement is available on all
>>   * VCPUs.  We assume it is to start with, and then set it to zero on
>> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
>>       pv_cpu_ops.load_gdt = xen_load_gdt;
>>  }
>>
>> +extern struct xen_domain_numa_layout pv_numa_layout;
>>  /* First C function to be called on Xen boot */
>>  asmlinkage void __init xen_start_kernel(void)
>>  {
>> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
>>               xen_start_info->console.domU.evtchn = 0;
>>       }
>>
>> +    {
>> +        struct xen_domain_numa_layout *layout =
>> +            (void *)((char *)xen_start_info +
>> +                        xen_start_info->numa_layout_info.info_off);
>> +        memcpy(&pv_numa_layout, layout, sizeof(*layout));
>> +    }
>
> Shouldn't there be a check to see if the Xen actually exports
> this structure? Otherwise you might copy garbage in and set
> pv_numa_layout values to random values.
>
>>       xen_raw_console_write("about to get started...\n");
>>
>>       /* Start the world */
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index 612f2c9..cb944a2 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
>>               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>>               disable_acpi();
>>       }
>> +
>> +    acpi_numa = -1;
>> +    numa_off = 1;
>>  #endif
>>
>>       /*
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 812ffd5..d4588fa 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -398,6 +398,53 @@ struct shared_info {
>>
>>  };
>>
>> +#define XEN_NR_CPUS 64
> No way to share this with some other variable? Like NR_CPUS?
>
>> +#if defined(__i386__)
>> +#define XEN_BITS_PER_LONG 32
>> +#define XEN_BYTES_PER_LONG 4
>
> There gotta be some of these defined in other headers.
>> +#define XEN_LONG_BYTEORDER 2
>> +#elif defined(__x86_64__)
>> +#define XEN_BITS_PER_LONG 64
>> +#define XEN_BYTES_PER_LONG 8
>> +#define XEN_LONG_BYTEORDER 3
>> +#endif
>> +
>> +/* same as cpumask_t - in xen and even Linux (for now) */
>> +#define XEN_BITS_TO_LONGS(bits) \
>> +    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
>> +#define XEN_DECLARE_BITMAP(name,bits) \
>> +    unsigned long name[XEN_BITS_TO_LONGS(bits)]
>
> I am pretty sure there are macros for this laready.
>> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
>> +#ifndef __XEN__
>
> Is that neccessary? typedefs are frowned upon in kernel.
>> +typedef struct xen_cpumask xen_cpumask_t;
>> +#endif
>> +
>> +#define XEN_MAX_VNODES 8
>> +struct xen_vnode_data {
>> +    uint32_t vnode_id;
>> +    uint32_t mnode_id;
>> +    uint64_t nr_pages;
>> +    /* XXX: Can we use this in xen<->domain interfaces ? */
>> +    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_vnode_data xen_vnode_data_t;
>
> Ditto.
>> +#endif
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +struct xen_domain_numa_layout {
>> +    uint32_t max_vcpus;
>> +    uint32_t max_vnodes;
>> +
>> +    /* Only (max_vnodes*max_vnodes) entries are filled */
>> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
>
> Ditto.
>> +#endif
>> +
>>  /*
>>   * Start-of-day memory layout for the initial domain (DOM0):
>>   *  1. The domain is started within contiguous virtual-memory region.
>> @@ -449,6 +496,13 @@ struct start_info {
>>       unsigned long mod_start;    /* VIRTUAL address of pre-loaded module. 
>>  */
>>       unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     
>> */
>>       int8_t cmd_line[MAX_GUEST_CMDLINE];
>> +    /* The pfn range here covers both page table and p->m table frames.   */
>> +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
>
> Why is this added in this patch? That doesn't look to be used by your
> patch.
>> +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
>
> Ditto.
>> +    struct {
>> +        uint32_t info_off;  /* Offset of console_info struct.         *
>> +        uint32_t info_size; /* Size of console_info struct from start.*/
>
> Wrong comment.
>> +    } numa_layout_info;
>
>>  };
>>
>>  struct dom0_vga_console_info {
>
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>