diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/Makefile      Wed Mar 31 19:39:58 2010 -0400
@@ -25,6 +25,7 @@
 CTRL_SRCS-y += xc_mem_event.c
 CTRL_SRCS-y += xc_mem_paging.c
 CTRL_SRCS-y += xc_memshr.c
+CTRL_SRCS-y += xc_dom_numa.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
 CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
 CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom.h
--- a/tools/libxc/xc_dom.h      Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/xc_dom.h      Wed Mar 31 19:39:58 2010 -0400
@@ -1,4 +1,5 @@
 #include <xen/libelf/libelf.h>
+#include
 
 #define INVALID_P2M_ENTRY ((xen_pfn_t)-1)
 
@@ -102,6 +103,7 @@
     int xen_version;
     xen_capabilities_info_t xen_caps;
+    struct xen_domain_numa_layout *numa_layout;
 
     /* kernel loader, arch hooks */
     struct xc_dom_loader *kernel_loader;
     void *private_loader;
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom_core.c
--- a/tools/libxc/xc_dom_core.c Tue Mar 30 21:18:25 2010 -0400
+++ b/tools/libxc/xc_dom_core.c Wed Mar 31 19:39:58 2010 -0400
@@ -19,6 +19,7 @@
 
 #include "xg_private.h"
 #include "xc_dom.h"
+#include "xc_dom_numa.h"
 
 /* ------------------------------------------------------------------------ */
 /* debugging */
@@ -678,8 +679,13 @@
                   __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10));
     dom->total_pages = nr_pages;
 
-    xc_dom_printf("%s: 0x%" PRIpfn " pages\n",
-                  __FUNCTION__, dom->total_pages);
+    xc_dom_printf("%s: 0x%" PRIpfn " pages\n", __FUNCTION__, dom->total_pages);
+
+    if (xc_setup_domain_numa_layout(dom))
+    {
+        /* Ignore the error and proceed as a non-NUMA guest */
+        xc_dom_printf("%s: xc_setup_domain_numa_layout failed\n", __FUNCTION__);
+    }
 
     return 0;
 }
diff -r c636287eab3c -r d6b7abf0d2d5 tools/libxc/xc_dom_numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_numa.c Wed Mar 31 19:39:58 2010 -0400
@@ -0,0 +1,512 @@
+/* XEN Guest NUMA (memory placement)
+ * Author : Dulloor (dulloor@xxxxxxxxxx) */
+
+#include
+#include "xg_private.h"
+#include "xc_dom.h"
+#include "xc_dom_numa.h"
+
+#define KB (1024)
+#define MB (KB * KB)
+
+#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
+
+/* numa layout structures */
+xc_machine_numa_layout_t phys_numa_layout;
+xc_domain_numa_layout_t pv_numa_layout;
+
+void dump_guest_numa_layout(struct xc_dom_image *dom,
+    xc_domain_numa_layout_t *layout)
+{
+    unsigned int i, j;
+    char vcpumask[128] = "";
+
+    xc_dom_printf("%s called :\n NUMA-LAYOUT(Dom%d) : vcpus(%u), vnodes(%u)",
+        __FUNCTION__, dom->guest_domid, layout->max_vcpus, layout->max_vnodes);
+
+    switch (layout->type)
+    {
+    case XEN_DOM_NUMA_CONFINED:
+        xc_dom_printf(", type(CONFINED)\n");
+        break;
+    case XEN_DOM_NUMA_SPLIT:
+        xc_dom_printf(", type(SPLIT)\n");
+        break;
+    case XEN_DOM_NUMA_STRIPED:
+        xc_dom_printf(", type(STRIPED)\n");
+        break;
+    default:
+        xc_dom_printf(", type(UNDEFINED)\n");
+    }
+
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+#if 0
+        xc_cpumask_scnprintf(vcpumask, sizeof(vcpumask),
+            *((cpumask_t *)&vnode_data->vcpu_mask));
+#endif
+        xc_dom_printf("vnode[%u]:mnode(%u), node_nr_pages(%lx), \
+            vcpu_mask(%s)\n", vnode_data->vnode_id, vnode_data->mnode_id,
+            vnode_data->nr_pages, vcpumask);
+    }
+
+    xc_dom_printf("vnode distances :\n");
+    for (i = 0; i < layout->max_vnodes; i++)
+        xc_dom_printf("\tvnode[%u]", i);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        xc_dom_printf("\nvnode[%u]", i);
+        for (j = 0; j < layout->max_vnodes; j++)
+            xc_dom_printf("\t%u",
+                layout->vnode_distance[i*layout->max_vnodes + j]);
+        xc_dom_printf("\n");
+    }
+    return;
+}
+
+static int
+xc_read_machine_numa_layout(int xc_handle, xc_machine_numa_layout_t *layout)
+{
+    int rc, i;
+    struct xenmem_numa_op memop;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+
+    memset(layout, 0, sizeof(*layout));
+
+    memop.cmd = XENMEM_machine_numa_layout;
+    memop.u.minfo.max_nodes = XC_MAX_NODES;
+    set_xen_guest_handle(memop.u.minfo.node_distance, layout->node_distance);
+    set_xen_guest_handle(memop.u.minfo.node_data, layout->node_data);
+
+    rc = 0;
+    if (lock_pages(&memop, sizeof(struct xenmem_numa_op)) ||
+        lock_pages(layout, sizeof(*layout)))
+    {
+        rc = -1;
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    if ((rc = xc_memory_op(xc_handle, XENMEM_numa_op, &memop)))
+    {
+        rc = -1;
+        xc_dom_printf("%s: XENMEM_machine_numa_layout failed\n", __FUNCTION__);
+        goto unlock;
+    }
+
+    layout->max_nodes = memop.u.minfo.max_nodes;
+    for (i = 0; i < layout->max_nodes; i++)
+    {
+        xc_dom_printf("mnode[%d] : size(%lu MB), free(%lu MB)\n",
+            layout->node_data[i].node_id,
+            (layout->node_data[i].node_memsize/MB),
+            (layout->node_data[i].node_memfree/MB));
+        layout->memsize += layout->node_data[i].node_memsize;
+        layout->memfree += layout->node_data[i].node_memfree;
+    }
+
+unlock:
+    unlock_pages(&memop, sizeof(struct xenmem_numa_op));
+out:
+    return rc;
+}
+
+static int
+xc_get_max_vcpus(int xc_handle, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)domid;
+    return ((do_domctl(xc_handle, &domctl) < 0)
+            ? 0 : (domctl.u.getdomaininfo.max_vcpu_id+1));
+}
+
+/* Static NUMA distribution :
+ * Guest not compiled for NUMA (numa_kernel=0 in config)
+ *  [1] (max_vnodes==1) => CONFINED
+ *  [2] (max_vnodes>1)  => STRIPED
+ * Guest compiled for NUMA (numa_kernel=1 in config)
+ *  [1] (max_vnodes==1) => CONFINED
+ *  [2] (max_vnodes>1 && max_vcpus<=max_vnodes) => STRIPED
+ *  [3] (max_vnodes>1 && max_vcpus>max_vnodes)  => SPLIT
+ * We allocate the memory from the top node (node with max available memory)
+ * for NUMA-aware guests. For the other guests, whether or not compiled with
+ * NUMA, we allocate from the bottom node (node with min available memory).
+ * This allows for a static technique, where the fragmentation (within a node)
+ * can be kept to a minimum.
+ *
+ * Dynamic NUMA distribution :
+ * In the future, we should also allow for dynamic balancing techniques -
+ * migration of VMs between nodes - to reduce striping as much as possible.
+ */
+
+/* Assuming (numa_kernel==1) for now.
+ * XXX: We should use an elf-xen-feature-note instead, which is
+ * set based on CONFIG_NUMA and CONFIG_NUMA_EMU.
+ */
+
+/* For the numa-aware guests, we would like to present a symmetrical
+ * topology in terms of the distribution of computing resources over the
+ * virtual nodes (of memory).
+ * We require the numa-aware guests to have (2^n) vcpus, so that
+ * the distribution over the nodes can be done as symmetrically
+ * as possible. We find the min (2^k) nodes which can fit the entire
+ * domain's memory. Each of the kmin nodes is then assigned 2^(n-k) vcpus,
+ * where (n>=k). Of course, it is possible to use any other distribution
+ * by just modifying the selection function.
+ */
+
+static uint64_t node_pages_selected[XC_MAX_NODES];
+
+/* The function makes a (greedy) best-fit selection of num_vnodes vnodes of
+ * vnode_pages pages each. The number of pages selected from each node is
+ * returned in the node_pages_selected array.
+ * The best-fit ranking is based on the fraction (in 1024 parts) of node
+ * memory occupied if the node is selected.
+ * Returns 0 on success and -1 if selection fails. */
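+/* Illustration (hypothetical sizes, shown in GB for brevity - the code works
+ * in page counts, which scale identically): placing a 1GB vnode on a node
+ * with 8GB total and 2GB free ranks ((8-2-1) << 10) / 8 = 640, while a node
+ * with 4GB total and 2GB free ranks ((4-2-1) << 10) / 4 = 256, so the
+ * larger, proportionally fuller node is picked. */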
+static int xc_select_best_fit_nodes(xc_machine_numa_layout_t *phys_layout,
+    uint32_t num_vnodes, uint64_t vnode_pages,
+    uint64_t *nodes_pages, uint32_t page_shift)
+{
+    int i, num_nodes_selected;
+    uint64_t best_fit_rank;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+    num_nodes_selected = 0;
+
+    do {
+        int selected_node = INVALID_NODE;
+
+        best_fit_rank = 0;
+        for (i=0; i<phys_layout->max_nodes; i++)
+        {
+            struct xenmem_node_data *node_data;
+            uint64_t node_sizepages, node_freepages;
+            uint64_t node_fit_rank;
+
+            /* Node is already selected */
+            if (nodes_pages[i])
+                continue;
+
+            node_data = &phys_layout->node_data[i];
+            node_sizepages = (node_data->node_memsize >> page_shift);
+            node_freepages = (node_data->node_memfree >> page_shift);
+
+            if (node_freepages < vnode_pages)
+                continue;
+
+            node_fit_rank = ((node_sizepages-node_freepages-vnode_pages)
+                                << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (node_fit_rank > best_fit_rank)
+            {
+                best_fit_rank = node_fit_rank;
+                selected_node = i;
+            }
+        }
+
+        /* Nodes could not be selected. Bail out ! */
+        if (selected_node == INVALID_NODE)
+            return -1;
+
+        nodes_pages[selected_node] = vnode_pages;
+        num_nodes_selected++;
+    } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+    return 0;
+}
+
+/* Sort the phys nodes in increasing order of free node memory */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+    int i, j;
+    uint32_t max_nodes;
+
+    max_nodes = phys_layout->max_nodes;
+
+    for (i = 0; i < max_nodes; i++)
+    {
+        uint64_t i_node_memfree = phys_layout->node_data[i].node_memfree;
+        for (j = i+1; j < max_nodes; j++)
+        {
+            uint64_t j_node_memfree = phys_layout->node_data[j].node_memfree;
+            if (i_node_memfree > j_node_memfree)
+            {
+                struct xenmem_node_data tmp_node_data;
+                tmp_node_data = phys_layout->node_data[i];
+                phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+                i_node_memfree = j_node_memfree;
+            }
+        }
+    }
+
+    return;
+}
+
+/* The function selects the nodes in increasing order of free node memory,
+ * and fills them up. The physical memory map for such a domain is striped
+ * across all the selected nodes.
+ * The phys_layout node_data structures could be sorted in place, so we
+ * should always use node_data->node_id when indexing the node_distance array.
+ */
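+/* Illustration (hypothetical sizes): striping a 6GB guest over nodes with
+ * 1GB, 2GB and 4GB free - visited in that (ascending) order - takes
+ * 1GB-64MB = 960MB from the first node, 2GB-64MB = 1984MB from the second,
+ * and the remaining 6144MB-960MB-1984MB = 3200MB from the third. */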
+#define XC_NODE_MIN_FREESIZE (64 * MB)
+static int xc_select_max_fit_nodes(xc_machine_numa_layout_t *phys_layout,
+    uint64_t pv_dom_pages, uint64_t *node_pages, uint32_t page_shift)
+{
+    int i;
+    uint64_t pv_alloc_pages;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+    xc_sort_nodeload(phys_layout);
+
+    pv_alloc_pages = 0;
+    for (i=0; i<phys_layout->max_nodes; i++)
+    {
+        struct xenmem_node_data *node_data;
+        uint64_t node_freepages, node_memfree;
+
+        node_data = &phys_layout->node_data[i];
+
+        /* In max-fit, if we try to pack the nodes too aggressively
+         * we might fail on any small allocation (from xen node heaps) */
+        node_memfree = node_data->node_memfree;
+        if (node_memfree <= XC_NODE_MIN_FREESIZE)
+            continue;
+        node_memfree -= XC_NODE_MIN_FREESIZE;
+
+        node_freepages = (node_memfree >> page_shift);
+        if (!node_freepages)
+            continue;
+
+        if (node_freepages > (pv_dom_pages-pv_alloc_pages))
+            node_freepages = (pv_dom_pages-pv_alloc_pages);
+
+        node_pages[i] = node_freepages;
+        pv_alloc_pages += node_freepages;
+    }
+
+    if (pv_alloc_pages != pv_dom_pages)
+    {
+        xc_dom_printf(
+            "%s: Failed to allocate memory (more ballooning needed?)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Policies for node selection need more research/experience.
+ * Also, live migration of the VMs (to other nodes) could provide
+ * periodic load balancing across the nodes. */
+#define XC_VNODE_MIN_SIZE (128 * MB)
+static int xc_select_domain_vnodes(struct xc_dom_image *dom,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *pv_layout)
+{
+    int i;
+    uint32_t page_shift, numa_strategy;
+    xen_pfn_t pv_dom_pages;
+
+    pv_dom_pages = dom->total_pages;
+    page_shift = XC_DOM_PAGE_SHIFT(dom);
+
+    if (pv_dom_pages > (phys_layout->memfree >> page_shift))
+    {
+        xc_dom_printf(
+            "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    /* Attempt to confine the VM or split the guest (and make it numa-aware) */
+    for (i = 1; i <= phys_layout->max_nodes; i<<=1)
+    {
+        uint64_t vnode_size_pages;
+
+        memset(node_pages_selected, 0, sizeof(node_pages_selected));
+
+        vnode_size_pages = pv_dom_pages/i;
+        if ((vnode_size_pages << page_shift) < XC_VNODE_MIN_SIZE)
+            break;
+
+        if (!xc_select_best_fit_nodes(phys_layout, i, vnode_size_pages,
+                node_pages_selected, page_shift))
+        {
+            if (i > 1)
+                numa_strategy = XEN_DOM_NUMA_SPLIT;
+            else
+                numa_strategy = XEN_DOM_NUMA_CONFINED;
+            goto selection_done;
+        }
+    }
+
+    /* The order of nodes in phys_layout could change */
+    memset(node_pages_selected, 0, sizeof(node_pages_selected));
+    if (!xc_select_max_fit_nodes(phys_layout, pv_dom_pages,
+            node_pages_selected, page_shift))
+    {
+        numa_strategy = XEN_DOM_NUMA_STRIPED;
+        goto selection_done;
+    }
+
+    xc_dom_printf(
+        "%s: Not enough memory for pv (more ballooning needed?)\n",
+        __FUNCTION__);
+    return -1;
+
+selection_done:
+
+    pv_layout->version = XEN_DOM_NUMA_INTERFACE_VERSION;
+    pv_layout->type = numa_strategy;
+
+    if (!(pv_layout->max_vcpus =
+            xc_get_max_vcpus(dom->guest_xc, dom->guest_domid)))
+    {
+        xc_dom_printf("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    for (i = 0; i < phys_layout->max_nodes; i++)
+    {
+        struct xenmem_node_data *node_data;
+        struct xen_vnode_data *vnode_data;
+        uint32_t vnode_id;
+
+        if (!node_pages_selected[i])
+            continue;
+
+        node_data = &phys_layout->node_data[i];
+        vnode_id = pv_layout->max_vnodes;
+        vnode_data = &pv_layout->vnode_data[vnode_id];
+        vnode_data->vnode_id = vnode_id;
+        vnode_data->nr_pages = node_pages_selected[i];
+        vnode_data->mnode_id = node_data->node_id;
+
+        pv_layout->max_vnodes++;
+        /* vnode_data->vcpu_mask is set later when distributing the
+         * vcpus over vnodes and assigning affinities */
+    }
+
+    dom->numa_layout = pv_layout;
+    return 0;
+}
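+
+/* Illustration (hypothetical sizes): a 3GB guest on a two-node machine with
+ * 2.5GB free per node cannot be confined (i=1 would need a single 3GB vnode),
+ * but i=2 gives two 1.5GB vnodes that best-fit onto the two nodes, so the
+ * guest is set up as a SPLIT (numa-aware) guest with two vnodes. */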
+
+int xc_setup_domain_numa_layout(struct xc_dom_image *dom)
+{
+    int xc_handle;
+    xc_machine_numa_layout_t *phys_layout;
+    xc_domain_numa_layout_t *pv_layout;
+
+    xc_dom_printf("%s: called\n", __FUNCTION__);
+
+    xc_handle = dom->guest_xc;
+    phys_layout = &phys_numa_layout;
+
+    if (xc_read_machine_numa_layout(xc_handle, phys_layout))
+    {
+        xc_dom_printf("%s: xc_read_machine_numa_layout failed\n",
+            __FUNCTION__);
+        return -1;
+    }
+
+    /* Allocate pv_numa_layout dynamically for VMs */
+    pv_layout = &pv_numa_layout;
+    memset(pv_layout, 0, sizeof(*pv_layout));
+
+    if (xc_select_domain_vnodes(dom, phys_layout, pv_layout))
+    {
+        xc_dom_printf("%s: xc_select_domain_vnodes failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    dump_guest_numa_layout(dom, pv_layout);
+    /* pv_layout is used only temporarily - XENMEM_numa_op to set the
+     * numa_layout for the domain */
+    return 0;
+}
+
+/*************************************************************************/
+/* node lookup for mfns */
+#define NUMA_NO_NODE 0xFF
+#define MEMNODE_BUFSIZE (PAGE_SIZE)
+
+static uint8_t memnode_buf[MEMNODE_BUFSIZE];
+
+static int mfn_to_nid(struct xenmem_machine_nodemap *memnode, xen_pfn_t mfn)
+{
+    unsigned nid;
+    uint8_t *memnode_map;
+    xen_paddr_t addr;
+
+    addr = pfn_to_paddr(mfn);
+    if ((addr >> memnode->shift) >= memnode->mapsize)
+    {
+        xc_dom_printf("(addr(%lx) >> memnode_shift) >= NODEMAPSIZE", addr);
+        return 0;
+    }
+    get_xen_guest_handle(memnode_map, memnode->map);
+    nid = memnode_map[addr >> memnode->shift];
+    return nid;
+}
+
+int xc_domain_nodemem_distribution(struct xc_dom_image *dom)
+{
+    int xc_handle;
+    int rc, nid, bad_nid;
+    xen_pfn_t *p2m, pfn, max_pfn;
+    uint32_t node[XC_MAX_NODES];
+    struct xenmem_numa_op memop;
+
+    xc_dom_printf("%s: nodemem distribution for domain %d\n",
+        __FUNCTION__, dom->guest_domid);
+
+    xc_handle = dom->guest_xc;
+    p2m = dom->p2m_host;
+    max_pfn = dom->total_pages;
+
+    memop.cmd = XENMEM_machine_nodemap;
+    memop.u.mnodemap.bufsize = MEMNODE_BUFSIZE;
+    memset(memnode_buf, NUMA_NO_NODE, MEMNODE_BUFSIZE);
+    set_xen_guest_handle(memop.u.mnodemap.map, memnode_buf);
+
+    if ( lock_pages(&memop, sizeof(struct xenmem_numa_op)) ||
+         lock_pages(memnode_buf, MEMNODE_BUFSIZE))
+    {
+        rc = -1;
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out;
+    }
+
+    if ((rc = xc_memory_op(xc_handle, XENMEM_numa_op, &memop)))
+    {
+        rc = -1;
+        xc_dom_printf("%s: XENMEM_machine_nodemap failed\n", __FUNCTION__);
+        goto unlock;
+    }
+
+    bad_nid = 0;
+    for (nid=0; nid<XC_MAX_NODES; nid++)
+        node[nid] = 0;
+    for (pfn=0; pfn<max_pfn; pfn++)
+    {
+        nid = mfn_to_nid(&memop.u.mnodemap, p2m[pfn]);
+        if ((nid == NUMA_NO_NODE) || (nid >= XC_MAX_NODES))
+        {
+            bad_nid++;
+            continue;
+        }
+        node[nid]++;
+    }
+    for (nid=0; nid