diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/Makefile      Wed Mar 31 19:39:58 2010 -0400
@@ -25,6 +25,7 @@
 CTRL_SRCS-y += xc_mem_event.c
 CTRL_SRCS-y += xc_mem_paging.c
 CTRL_SRCS-y += xc_memshr.c
+CTRL_SRCS-y += xc_dom_numa.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
 CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
 CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom.h
--- a/tools/libxc/xc_dom.h      Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/xc_dom.h      Wed Mar 31 19:39:58 2010 -0400
@@ -1,4 +1,5 @@
 #include <xen/libelf/libelf.h>
+#include
 
 #define INVALID_P2M_ENTRY ((xen_pfn_t)-1)
 
@@ -102,6 +103,7 @@
     int xen_version;
     xen_capabilities_info_t xen_caps;
+    struct xen_domain_numa_layout *numa_layout;
 
     /* kernel loader, arch hooks */
     struct xc_dom_loader *kernel_loader;
     void *private_loader;
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom_core.c
--- a/tools/libxc/xc_dom_core.c Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/xc_dom_core.c Wed Mar 31 19:39:58 2010 -0400
@@ -19,6 +19,7 @@
 
 #include "xg_private.h"
 #include "xc_dom.h"
+#include "xc_dom_numa.h"
 
 /* ------------------------------------------------------------------------ */
 /* debugging */
@@ -678,8 +679,13 @@
                   __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10));
     dom->total_pages = nr_pages;
 
-    xc_dom_printf("%s: 0x%" PRIpfn " pages\n",
-                  __FUNCTION__, dom->total_pages);
+    xc_dom_printf("%s: 0x%" PRIpfn " pages\n", __FUNCTION__, dom->total_pages);
+
+    if (xc_setup_domain_numa_layout(dom))
+    {
+        /* Ignore the error and proceed as a non-NUMA guest */
+        xc_dom_printf("%s: xc_setup_domain_numa_layout failed\n", __FUNCTION__);
+    }
 
     return 0;
 }
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom_numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_numa.c Wed Mar 31 19:39:58 2010 -0400
@@ -0,0 +1,512 @@
+/* XEN Guest NUMA (memory placement)
+ * Author : Dulloor (dulloor@xxxxxxxxxx) */
+
+#include
+#include "xg_private.h"
+#include "xc_dom.h"
+#include "xc_dom_numa.h"
+
+#define KB (1024)
+#define MB (KB * KB)
+
+#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
+
+/* numa layout structures */
+xc_machine_numa_layout_t phys_numa_layout;
+xc_domain_numa_layout_t pv_numa_layout;
+
+void dump_guest_numa_layout(struct xc_dom_image *dom,
+    xc_domain_numa_layout_t *layout)
+{
+    unsigned int i, j;
+    char vcpumask[128] = "";
+
+    xc_dom_printf("%s called :\n NUMA-LAYOUT(Dom%d) : vcpus(%u), vnodes(%u)",
+        __FUNCTION__, dom->guest_domid, layout->max_vcpus, layout->max_vnodes);
+
+    switch (layout->type)
+    {
+    case XEN_DOM_NUMA_CONFINED:
+        xc_dom_printf(", type(CONFINED)\n");
+        break;
+    case XEN_DOM_NUMA_SPLIT:
+        xc_dom_printf(", type(SPLIT)\n");
+        break;
+    case XEN_DOM_NUMA_STRIPED:
+        xc_dom_printf(", type(STRIPED)\n");
+        break;
+    default:
+        xc_dom_printf(", type(UNDEFINED)\n");
+    }
+
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+#if 0
+        xc_cpumask_scnprintf(vcpumask, sizeof(vcpumask),
+            *((cpumask_t *)&vnode_data->vcpu_mask));
+#endif
+        xc_dom_printf("vnode[%u]:mnode(%u), node_nr_pages(%lx), \
+            vcpu_mask(%s)\n", vnode_data->vnode_id, vnode_data->mnode_id,
+            vnode_data->nr_pages, vcpumask);
+    }
+
+    xc_dom_printf("vnode distances :\n");
+    for (i = 0; i < layout->max_vnodes; i++)
+        xc_dom_printf("\tvnode[%u]", i);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        xc_dom_printf("\nvnode[%u]", i);
+        for (j = 0; j < layout->max_vnodes; j++)
+            xc_dom_printf("\t%u",
+                layout->vnode_distance[i*layout->max_vnodes + j]);
+        xc_dom_printf("\n");
+    }
+    return;
+}
+
+static int
+xc_read_machine_numa_layout(int xc_handle, xc_machine_numa_layout_t *layout)
+{
+    int rc, i;
+    struct xenmem_numa_op memop;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+
+    memset(layout, 0, sizeof(*layout));
+
+    memop.cmd = XENMEM_machine_numa_layout;
+    memop.u.minfo.max_nodes = XC_MAX_NODES;
+    set_xen_guest_handle(memop.u.minfo.node_distance, layout->node_distance);
+    set_xen_guest_handle(memop.u.minfo.node_data, layout->node_data);
+
+    rc = 0;
+    if (lock_pages(&memop, sizeof(struct xenmem_numa_op)) ||
+        lock_pages(layout, sizeof(*layout)))
+    {
+        rc = -1;
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    if ((rc = xc_memory_op(xc_handle, XENMEM_numa_op, &memop)))
+    {
+        rc = -1;
+        xc_dom_printf("%s: XENMEM_machine_numa_layout failed\n", __FUNCTION__);
+        goto unlock;
+    }
+
+    layout->max_nodes = memop.u.minfo.max_nodes;
+    for (i = 0; i < layout->max_nodes; i++)
+    {
+        xc_dom_printf("mnode[%d] : size(%lu MB), free(%lu MB)\n",
+            layout->node_data[i].node_id,
+            (layout->node_data[i].node_memsize/MB),
+            (layout->node_data[i].node_memfree/MB));
+        layout->memsize += layout->node_data[i].node_memsize;
+        layout->memfree += layout->node_data[i].node_memfree;
+    }
+
+unlock:
+    unlock_pages(&memop, sizeof(struct xenmem_numa_op));
+out:
+    return rc;
+}
+
+static int
+xc_get_max_vcpus(int xc_handle, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)domid;
+    return ((do_domctl(xc_handle, &domctl) < 0)
+            ? 0 : (domctl.u.getdomaininfo.max_vcpu_id+1));
+}
+
+/* Static NUMA distribution :
+ * Guest not compiled for NUMA (numa_kernel=0 in config)
+ *  [1] (max_vnodes==1) => CONFINED
+ *  [2] (max_vnodes>1)  => STRIPED
+ * Guest compiled for NUMA (numa_kernel=1 in config)
+ *  [1] (max_vnodes==1) => CONFINED
+ *  [2] (max_vnodes>1 && max_vcpus<=max_vnodes) => STRIPED
+ *  [3] (max_vnodes>1 && max_vcpus>max_vnodes)  => SPLIT
+ * We allocate the memory from the top node (node with max available memory)
+ * for NUMA-aware guests. For the other guests, whether or not compiled with
+ * NUMA, we allocate from the bottom node (node with min available memory).
+ * This allows for a static technique, where the fragmentation (within a node)
+ * can be kept to a minimum.
+ *
+ * Dynamic NUMA distribution :
+ * In the future, we should also allow for dynamic balancing techniques -
+ * migration of VMs between nodes - to reduce striping as much as possible.
+ */
+
+/* Assuming (numa_kernel==1) for now.
+ * XXX: We should use an elf-xen-feature-note instead, which is
+ * set based on CONFIG_NUMA and CONFIG_NUMA_EMU.
+ */
+
+/* For the numa-aware guests, we would like to present a symmetrical
+ * topology in terms of the distribution of computing resources over the
+ * virtual nodes (of memory).
+ * We require the numa-aware guests to have (2^n) vcpus, so that
+ * the distribution over the nodes can be done as symmetrically
+ * as possible. We find the min (2^k) nodes which can fit the entire
+ * domain's memory. Each of the kmin nodes is then assigned 2^(n-k) vcpus,
+ * where (n>=k). Of course, it is possible to use any other distribution
+ * by just modifying the selection function.
+ */
+
+static uint64_t node_pages_selected[XC_MAX_NODES];
+
+/* The function makes a (greedy) best-fit selection of num_vnodes vnodes of
+ * vnode_pages pages each. The number of pages selected from each node is
+ * returned in the node_pages_selected array.
+ * The best-fit ranking is based on the fraction (in 1024 parts) of node
+ * memory occupied if the node is selected.
+ * Returns 0 on success and -1 if selection fails. */
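+/* Illustration (hypothetical sizes, shown in GB for brevity - the code works
+ * in page counts, which scale identically): placing a 1GB vnode on a node
+ * with 8GB total and 2GB free ranks ((8-2-1) << 10) / 8 = 640, while a node
+ * with 4GB total and 2GB free ranks ((4-2-1) << 10) / 4 = 256, so the
+ * larger, proportionally fuller node is picked. */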
+static int xc_select_best_fit_nodes(xc_machine_numa_layout_t *phys_layout,
+    uint32_t num_vnodes, uint64_t vnode_pages,
+    uint64_t *nodes_pages, uint32_t page_shift)
+{
+    int i, num_nodes_selected;
+    uint64_t best_fit_rank;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+    num_nodes_selected = 0;
+
+    do {
+        int selected_node = INVALID_NODE;
+
+        best_fit_rank = 0;
+        for (i=0; i<phys_layout->max_nodes; i++)
+        {
+            struct xenmem_node_data *node_data;
+            uint64_t node_sizepages, node_freepages;
+            uint64_t node_fit_rank;
+
+            /* Node is already selected */
+            if (nodes_pages[i])
+                continue;
+
+            node_data = &phys_layout->node_data[i];
+            node_sizepages = (node_data->node_memsize >> page_shift);
+            node_freepages = (node_data->node_memfree >> page_shift);
+
+            if (node_freepages < vnode_pages)
+                continue;
+
+            node_fit_rank = ((node_sizepages-node_freepages-vnode_pages)
+                                << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (node_fit_rank > best_fit_rank)
+            {
+                best_fit_rank = node_fit_rank;
+                selected_node = i;
+            }
+        }
+
+        /* Nodes could not be selected. Bail out ! */
+        if (selected_node == INVALID_NODE)
+            return -1;
+
+        nodes_pages[selected_node] = vnode_pages;
+        num_nodes_selected++;
+    } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+    return 0;
+}
+
+/* Sort the phys nodes in increasing order of free node memory */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+    int i, j;
+    uint32_t max_nodes;
+
+    max_nodes = phys_layout->max_nodes;
+
+    for (i = 0; i < max_nodes; i++)
+    {
+        uint64_t i_node_memfree = phys_layout->node_data[i].node_memfree;
+        for (j = i+1; j < max_nodes; j++)
+        {
+            uint64_t j_node_memfree = phys_layout->node_data[j].node_memfree;
+            if (i_node_memfree > j_node_memfree)
+            {
+                struct xenmem_node_data tmp_node_data;
+                tmp_node_data = phys_layout->node_data[i];
+                phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+                i_node_memfree = j_node_memfree;
+            }
+        }
+    }
+
+    return;
+}
+
+/* The function selects the nodes in increasing order of free node memory,
+ * and fills them up. The physical memory map for such a domain is striped
+ * across all the selected nodes.
+ * The phys_layout node_data structures could be sorted in place, so we
+ * should always use node_data->node_id when indexing the node_distance array.
+ */
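+/* Illustration (hypothetical sizes): striping a 6GB guest over nodes with
+ * 1GB, 2GB and 4GB free - visited in that (ascending) order - takes
+ * 1GB-64MB = 960MB from the first node, 2GB-64MB = 1984MB from the second,
+ * and the remaining 6144MB-960MB-1984MB = 3200MB from the third. */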
+#define XC_NODE_MIN_FREESIZE (64 * MB)
+static int xc_select_max_fit_nodes(xc_machine_numa_layout_t *phys_layout,
+    uint64_t pv_dom_pages, uint64_t *node_pages, uint32_t page_shift)
+{
+    int i;
+    uint64_t pv_alloc_pages;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+    xc_sort_nodeload(phys_layout);
+
+    pv_alloc_pages = 0;
+    for (i=0; i<phys_layout->max_nodes; i++)
+    {
+        struct xenmem_node_data *node_data;
+        uint64_t node_freepages, node_memfree;
+
+        node_data = &phys_layout->node_data[i];
+
+        /* In max-fit, if we try to pack the nodes too aggressively
+         * we might fail on any small allocation (from xen node heaps) */
+        node_memfree = node_data->node_memfree;
+        if (node_memfree <= XC_NODE_MIN_FREESIZE)
+            continue;
+        node_memfree -= XC_NODE_MIN_FREESIZE;
+
+        node_freepages = (node_memfree >> page_shift);
+        if (!node_freepages)
+            continue;
+
+        if (node_freepages > (pv_dom_pages-pv_alloc_pages))
+            node_freepages = (pv_dom_pages-pv_alloc_pages);
+
+        node_pages[i] = node_freepages;
+        pv_alloc_pages += node_freepages;
+    }
+
+    if (pv_alloc_pages != pv_dom_pages)
+    {
+        xc_dom_printf(
+            "%s: Failed to allocate memory (more ballooning needed?)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Policies for node selection need more research/experience.
+ * Also, live migration of the VMs (to other nodes) could provide
+ * periodic load balancing across the nodes. */
+#define XC_VNODE_MIN_SIZE (128 * MB)
+static int xc_select_domain_vnodes(struct xc_dom_image *dom,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *pv_layout)
+{
+    int i;
+    uint32_t page_shift, numa_strategy;
+    xen_pfn_t pv_dom_pages;
+
+    pv_dom_pages = dom->total_pages;
+    page_shift = XC_DOM_PAGE_SHIFT(dom);
+
+    if (pv_dom_pages > (phys_layout->memfree >> page_shift))
+    {
+        xc_dom_printf(
+            "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    /* Attempt to confine the VM or split the guest (and make it numa-aware) */
+    for (i = 1; i <= phys_layout->max_nodes; i<<=1)
+    {
+        uint64_t vnode_size_pages;
+
+        memset(node_pages_selected, 0, sizeof(node_pages_selected));
+
+        vnode_size_pages = pv_dom_pages/i;
+        if ((vnode_size_pages << page_shift) < XC_VNODE_MIN_SIZE)
+            break;
+
+        if (!xc_select_best_fit_nodes(phys_layout, i, vnode_size_pages,
+                node_pages_selected, page_shift))
+        {
+            if (i > 1)
+                numa_strategy = XEN_DOM_NUMA_SPLIT;
+            else
+                numa_strategy = XEN_DOM_NUMA_CONFINED;
+            goto selection_done;
+        }
+    }
+
+    /* The order of nodes in phys_layout could change */
+    memset(node_pages_selected, 0, sizeof(node_pages_selected));
+    if (!xc_select_max_fit_nodes(phys_layout, pv_dom_pages,
+            node_pages_selected, page_shift))
+    {
+        numa_strategy = XEN_DOM_NUMA_STRIPED;
+        goto selection_done;
+    }
+
+    xc_dom_printf(
+        "%s: Not enough memory for pv (more ballooning needed?)\n",
+        __FUNCTION__);
+    return -1;
+
+selection_done:
+
+    pv_layout->version = XEN_DOM_NUMA_INTERFACE_VERSION;
+    pv_layout->type = numa_strategy;
+
+    if (!(pv_layout->max_vcpus =
+            xc_get_max_vcpus(dom->guest_xc, dom->guest_domid)))
+    {
+        xc_dom_printf("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    for (i = 0; i < phys_layout->max_nodes; i++)
+    {
+        struct xenmem_node_data *node_data;
+        struct xen_vnode_data *vnode_data;
+        uint32_t vnode_id;
+
+        if (!node_pages_selected[i])
+            continue;
+
+        node_data = &phys_layout->node_data[i];
+        vnode_id = pv_layout->max_vnodes;
+        vnode_data = &pv_layout->vnode_data[vnode_id];
+        vnode_data->vnode_id = vnode_id;
+        vnode_data->nr_pages = node_pages_selected[i];
+        vnode_data->mnode_id = node_data->node_id;
+
+        pv_layout->max_vnodes++;
+        /* vnode_data->vcpu_mask is set later when distributing the
+         * vcpus over vnodes and assigning affinities */
+    }
+
+    dom->numa_layout = pv_layout;
+    return 0;
+}
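+
+/* Illustration (hypothetical sizes): a 3GB guest on a two-node machine with
+ * 2.5GB free per node cannot be confined (i=1 would need a single 3GB vnode),
+ * but i=2 gives two 1.5GB vnodes that best-fit onto the two nodes, so the
+ * guest is set up as a SPLIT (numa-aware) guest with two vnodes. */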
+
+int xc_setup_domain_numa_layout(struct xc_dom_image *dom)
+{
+    int xc_handle;
+    xc_machine_numa_layout_t *phys_layout;
+    xc_domain_numa_layout_t *pv_layout;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+
+    xc_handle = dom->guest_xc;
+    phys_layout = &phys_numa_layout;
+
+    if (xc_read_machine_numa_layout(xc_handle, phys_layout))
+    {
+        xc_dom_printf("%s: xc_read_machine_numa_layout failed\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    /* Allocate pv_numa_layout dynamically for VMs */
+    pv_layout = &pv_numa_layout;
+    memset(pv_layout, 0, sizeof(*pv_layout));
+
+    if (xc_select_domain_vnodes(dom, phys_layout, pv_layout))
+    {
+        xc_dom_printf("%s: xc_select_domain_vnodes failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    dump_guest_numa_layout(dom, pv_layout);
+    /* pv_layout is used only temporarily - XENMEM_numa_op to set the
+     * numa_layout for the domain */
+    return 0;
+}
+
+/*************************************************************************/
+/* node lookup for mfns */
+#define NUMA_NO_NODE 0xFF
+#define MEMNODE_BUFSIZE (PAGE_SIZE)
+
+static uint8_t memnode_buf[MEMNODE_BUFSIZE];
+
+static int mfn_to_nid(struct xenmem_machine_nodemap *memnode, xen_pfn_t mfn)
+{
+    unsigned nid;
+    uint8_t *memnode_map;
+    xen_paddr_t addr;
+
+    addr = pfn_to_paddr(mfn);
+    if ((addr >> memnode->shift) >= memnode->mapsize)
+    {
+        xc_dom_printf("(addr(%lx) >> memnode_shift) >= NODEMAPSIZE", addr);
+        return 0;
+    }
+    get_xen_guest_handle(memnode_map, memnode->map);
+    nid = memnode_map[addr >> memnode->shift];
+    return nid;
+}
+
+int xc_domain_nodemem_distribution(struct xc_dom_image *dom)
+{
+    int xc_handle;
+    int rc, nid, bad_nid;
+    xen_pfn_t *p2m, pfn, max_pfn;
+    uint32_t node[XC_MAX_NODES];
+    struct xenmem_numa_op memop;
+
+    xc_dom_printf("%s: nodemem distribution for domain %d\n",
+        __FUNCTION__, dom->guest_domid);
+
+    xc_handle = dom->guest_xc;
+    p2m = dom->p2m_host;
+    max_pfn = dom->total_pages;
+
+    memop.cmd = XENMEM_machine_nodemap;
+    memop.u.mnodemap.bufsize = MEMNODE_BUFSIZE;
+    memset(memnode_buf, NUMA_NO_NODE, MEMNODE_BUFSIZE);
+    set_xen_guest_handle(memop.u.mnodemap.map, memnode_buf);
+
+    if ( lock_pages(&memop, sizeof(struct xenmem_numa_op)) ||
+         lock_pages(memnode_buf, MEMNODE_BUFSIZE))
+    {
+        rc = -1;
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    if ((rc = xc_memory_op(xc_handle, XENMEM_numa_op, &memop)))
+    {
+        rc = -1;
+        xc_dom_printf("%s: XENMEM_machine_nodemap failed\n", __FUNCTION__);
+        goto unlock;
+    }
+
+    bad_nid = 0;
+    for (nid=0; nid<XC_MAX_NODES; nid++)
+        node[nid] = 0;
+    for (pfn=0; pfn<max_pfn; pfn++)
+    {
+        nid = mfn_to_nid(&memop.u.mnodemap, p2m[pfn]);
+        if ((nid == NUMA_NO_NODE) || (nid >= XC_MAX_NODES))
+        {
+            bad_nid++;
+            continue;
+        }
+        node[nid]++;
+    }
+    for (nid=0; nid