
[Xen-devel] [PATCH v4 6/7] libxl: vNUMA supporting interface



* Checks whether the vnode-to-physical-node mask was incorrectly
defined. If so and vnuma_placement is set to 1, tries to use the
automatic NUMA placement mechanism, otherwise falls back to the
default mask VNUMA_NO_NODE. If the user-defined allocation map can
be used, based on the memory requirements, automatic NUMA placement
is disabled.
* Verifies the correctness of the memory range pfns for a PV guest
by requesting the e820 map for that domain, taking the e820_host
config option into account;
* Provides Xen with the vNUMA topology and the allocation map used
for vnodes (an illustrative caller sketch follows below);
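
For illustration only (not part of this patch): a minimal sketch of a
caller using the new libxl_domain_setvnuma() interface for a hypothetical
2-vnode, 4-vcpu, 2GB guest. The vmemrange_t field names follow their use
in libxl__vnuma_align_mem() below; the flat row-major layout of vdistance
and the concrete values are assumptions, and error handling is omitted:

    vmemrange_t ranges[2] = {
        { .start = 0,             .end = 1024ULL << 20 },  /* vnode 0: 1GB */
        { .start = 1024ULL << 20, .end = 2048ULL << 20 },  /* vnode 1: 1GB */
    };
    unsigned int vdistance[4]      = { 10, 20, 20, 10 };   /* 2x2 matrix */
    unsigned int vcpu_to_vnode[4]  = { 0, 0, 1, 1 };
    unsigned int vnode_to_pnode[2] = { 0, 1 };

    if (libxl_domain_setvnuma(ctx, domid, 2 /* nr_vnodes */, 4 /* nr_vcpus */,
                              ranges, vdistance, vcpu_to_vnode,
                              vnode_to_pnode) < 0)
        /* fall back to a non-vNUMA build */;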

Comment on e820 map and memory alignment:

When e820_host is not set, the PV guest has a fixed e820 map:

[    0.000000] e820: BIOS-provided physical RAM map:
[    0.000000] Xen: [mem 0x0000000000000000-0x000000000009ffff] usable
[    0.000000] Xen: [mem 0x00000000000a0000-0x00000000000fffff] reserved
[    0.000000] Xen: [mem 0x0000000000100000-0x00000000f9ffffff] usable
[    0.000000] e820: update [mem 0x00000000-0x00000fff] usable ==> reserved
[    0.000000] e820: remove [mem 0x000a0000-0x000fffff] usable

This means the first 4KB (0x0000 - 0x0fff) and the 384KB gap between
0xa0000 and 0xfffff are reserved. Since these pfns never appear in the
page allocations, the memory ranges for the guest are in this case
constructed purely from the vnode sizes.

In case e820_host is set to 1, the memory holes should be taken into
account.
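
For illustration, a worked example with made-up numbers (two vnodes of
512MB each):

    e820_host unset (fixed map above):
        vnode 0: [0x00000000, 0x20000000)   512MB
        vnode 1: [0x20000000, 0x40000000)   512MB
    The reserved page at 0x0000 - 0x0fff and the 0xa0000 - 0xfffff gap
    never appear in page allocations, so the ranges are built purely from
    the vnode sizes.

    e820_host set to 1: suppose a 0x60000-byte hole of the host map falls
    inside vnode 0's range. e820_memory_hole_size() then reports 0x60000
    absent bytes, so libxl__vnuma_align_mem() keeps extending vnode 0's
    end in MIN_VNODE_SIZE steps until the usable RAM inside the range
    reaches 512MB again; vnode 1 then starts at that new end, and the
    last vnode is clamped to the highest RAM address in the map.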

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
 tools/libxl/libxl.c          |   18 +++++
 tools/libxl/libxl.h          |   20 ++++++
 tools/libxl/libxl_arch.h     |    6 ++
 tools/libxl/libxl_dom.c      |  158 +++++++++++++++++++++++++++++++++++++-----
 tools/libxl/libxl_internal.h |    6 ++
 tools/libxl/libxl_numa.c     |   49 +++++++++++++
 tools/libxl/libxl_x86.c      |  123 ++++++++++++++++++++++++++++++++
 7 files changed, 363 insertions(+), 17 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 9b93262..4b67640 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4658,6 +4658,24 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
     return 0;
 }
 
+int libxl_domain_setvnuma(libxl_ctx *ctx,
+                            uint32_t domid,
+                            uint16_t nr_vnodes,
+                            uint16_t nr_vcpus,
+                            vmemrange_t *vmemrange,
+                            unsigned int *vdistance,
+                            unsigned int *vcpu_to_vnode,
+                            unsigned int *vnode_to_pnode)
+{
+    int ret;
+    ret = xc_domain_setvnuma(ctx->xch, domid, nr_vnodes,
+                                nr_vcpus, vmemrange,
+                                vdistance,
+                                vcpu_to_vnode,
+                                vnode_to_pnode);
+    return ret;
+}
+
 int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
 {
     GC_INIT(ctx);
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index a9663e4..6087ddc 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -281,11 +281,14 @@
 #include <netinet/in.h>
 #include <sys/wait.h> /* for pid_t */
 
+#include <xen/memory.h>
 #include <xentoollog.h>
 
 #include <libxl_uuid.h>
 #include <_libxl_list.h>
 
+#include <xen/vnuma.h>
+
 /* API compatibility. */
 #ifdef LIBXL_API_VERSION
 #if LIBXL_API_VERSION != 0x040200 && LIBXL_API_VERSION != 0x040300 && \
@@ -391,6 +394,14 @@
 #define LIBXL_EXTERNAL_CALLERS_ONLY /* disappears for callers outside libxl */
 #endif
 
+/*
+ * LIBXL_HAVE_BUILDINFO_VNUMA indicates that a vNUMA topology will be
+ * built for the guest on request, based on the VM configuration.
+ * libxl will try to find the best allocation of vNUMA nodes
+ * on the physical NUMA nodes.
+ */
+#define LIBXL_HAVE_BUILDINFO_VNUMA 1
+
 typedef uint8_t libxl_mac[6];
 #define LIBXL_MAC_FMT "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx"
 #define LIBXL_MAC_FMTLEN ((2*6)+5) /* 6 hex bytes plus 5 colons */
@@ -750,6 +761,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
 void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
 void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
 
+int libxl_domain_setvnuma(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vmemrange_t *vmemrange,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode);
+
 /*
  * Devices
  * =======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index aee0a91..9caf0ae 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -22,4 +22,10 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
 int libxl__arch_domain_configure(libxl__gc *gc,
                                  libxl_domain_build_info *info,
                                  struct xc_dom_image *dom);
+
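+/* Align vNUMA memory block boundaries to the domain's e820 map. */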
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            struct libxl_domain_build_info *b_info,
+                            vmemrange_t *memblks);
+
 #endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 72489f8..5ff8218 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -23,6 +23,7 @@
 #include <xc_dom.h>
 #include <xen/hvm/hvm_info_table.h>
 #include <xen/hvm/hvm_xs_strings.h>
+#include <libxl_vnuma.h>
 
 libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
 {
@@ -201,6 +202,64 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
     return rc;
 }
 
+/* prepares vnode to pnode map for domain vNUMA memory allocation */
+int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
+                        libxl_domain_build_info *info)
+{
+    int i, n, nr_nodes = 0, rc;
+    uint64_t *mems;
+    unsigned long long *claim = NULL;
+    libxl_numainfo *ninfo = NULL;
+
+    rc = ERROR_FAIL;
+
+    /* default setting */
+    for (i = 0; i < info->nr_vnodes; i++)
+        info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+
+    /* Get NUMA info */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL) {
+        rc = 0;
+        goto vnmapout;
+    }
+
+    /*
+     * We don't try to build the vnode_to_pnode map
+     * if info->nodemap is full, which means that
+     * no nodemap was built.
+     */
+    if (libxl_bitmap_is_full(&info->nodemap)) {
+        LOG(DETAIL, "No suitable NUMA candidates were found for vnuma.\n");
+        rc = 0;
+        goto vnmapout;
+    }
+    mems = info->vnuma_memszs;
+    /*
+     * TODO: review and improve the algorithm.
+     * If no pnode is found, the vnode stays set to VNUMA_NO_NODE.
+     */
+    claim = libxl__calloc(gc, nr_nodes, sizeof(*claim));
+
+    libxl_for_each_set_bit(n, info->nodemap)
+    {
+        for (i = 0; i < info->nr_vnodes; i++)
+        {
+            if (((claim[n] + (mems[i] << 20)) <= ninfo[n].free) &&
+                 /*vnode was not set yet */
+                 (info->vnode_to_pnode[i] == VNUMA_NO_NODE ) )
+            {
+                info->vnode_to_pnode[i] = n;
+                claim[n] += (mems[i] << 20);
+            }
+        }
+    }
+
+    rc = 0;
+ vnmapout:
+    return rc;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
@@ -214,27 +273,70 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         return ERROR_FAIL;
     }
 
-    /*
-     * Check if the domain has any CPU affinity. If not, try to build
-     * up one. In case numa_place_domain() find at least a suitable
-     * candidate, it will affect info->nodemap accordingly; if it
-     * does not, it just leaves it as it is. This means (unless
-     * some weird error manifests) the subsequent call to
-     * libxl_domain_set_nodeaffinity() will do the actual placement,
-     * whatever that turns out to be.
-     */
-    if (libxl_defbool_val(info->numa_placement)) {
+    if (info->nr_vnodes > 0) {
+        /* The memory blocks are formed here from the vnode sizes */
+        struct vmemrange *memrange = libxl__calloc(gc, info->nr_vnodes,
+                                                sizeof(*memrange));
 
-        if (!libxl_bitmap_is_full(&info->cpumap)) {
-            LOG(ERROR, "Can run NUMA placement only if no vcpu "
-                       "affinity is specified");
-            return ERROR_INVAL;
+        if (libxl__vnuma_align_mem(gc, domid, info, memrange) < 0) {
+            LOG(DETAIL, "Failed to align memory map.\n");
+            return ERROR_FAIL;
+        }
+
+        /*
+         * If a vNUMA vnode_to_pnode map is defined, determine whether
+         * we can disable automatic NUMA placement and place vnodes
+         * on the specified pnodes.
+         * For now, if vcpu affinity is specified, we will use the
+         * specified vnode-to-pnode map.
+         */
+
+        /* Should automatic NUMA placement be used? */
+        if (libxl_defbool_val(info->vnuma_placement)) {
+            /*
+             * Check if the domain has any CPU affinity. If not, try to build
+             * up one. In case numa_place_domain() find at least a suitable
+             * candidate, it will affect info->nodemap accordingly; if it
+             * does not, it just leaves it as it is. This means (unless
+             * some weird error manifests) the subsequent call to
+             * libxl_domain_set_nodeaffinity() will do the actual placement,
+             * whatever that turns out to be.
+             */
+            if (libxl_defbool_val(info->numa_placement)) {
+                if (!libxl_bitmap_is_full(&info->cpumap)) {
+                    LOG(ERROR, "Can run NUMA placement only if no vcpu "
+                               "affinity is specified");
+                    return ERROR_INVAL;
+                }
+
+                rc = numa_place_domain(gc, domid, info);
+                if (rc)
+                    return rc;
+                /* init vnodemap to numa automatic placement */
+                if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
+                    LOG(DETAIL, "Failed to init vnodemap\n");
+                    /* vnuma_nodemap will not be used if nr_vnodes == 0 */
+                    return ERROR_FAIL;
+                }
+            }
+        } else {
+            if (libxl__vnodemap_is_usable(gc, info))
+                libxl_defbool_set(&info->numa_placement, false);
+            else {
+                LOG(ERROR, "The allocation mask for vnuma nodes cannot be used.\n");
+                return ERROR_FAIL;
+            }
         }
 
-        rc = numa_place_domain(gc, domid, info);
-        if (rc)
-            return rc;
+        if (xc_domain_setvnuma(ctx->xch, domid, info->nr_vnodes,
+                                info->max_vcpus, memrange,
+                                info->vdistance, info->vcpu_to_vnode,
+                                info->vnode_to_pnode) < 0) {
+            LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Failed to set vnuma topology for domain.\n");
+            return ERROR_FAIL;
+        }
     }
+
     libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
 
@@ -382,6 +484,28 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
         }
     }
 
+    if (info->nr_vnodes != 0) {
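+        /*
+         * Pass the vNUMA node memory sizes and the vnode-to-pnode map
+         * on to the domain builder so memory can be allocated per vnode.
+         */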
+        dom->vnode_to_pnode = (unsigned int *)malloc(
+                                info->nr_vnodes * sizeof(*info->vnode_to_pnode));
+        dom->vnuma_memszs = (uint64_t *)malloc(
+                              info->nr_vnodes * sizeof(*info->vnuma_memszs));
+
+        if ( dom->vnuma_memszs == NULL || dom->vnode_to_pnode == NULL ) {
+            info->nr_vnodes = 0;
+            if (dom->vnode_to_pnode) free(dom->vnode_to_pnode);
+            if (dom->vnuma_memszs) free(dom->vnuma_memszs);
+            goto out;
+        }
+
+        memcpy(dom->vnuma_memszs, info->vnuma_memszs,
+                sizeof(*info->vnuma_memszs) * info->nr_vnodes);
+        memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+                sizeof(*info->vnode_to_pnode) * info->nr_vnodes);
+
+        dom->nr_vnodes = info->nr_vnodes;
+    } else
+        goto out;
+
     dom->flags = flags;
     dom->console_evtchn = state->console_port;
     dom->console_domid = state->console_domid;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index a2d8247..c842763 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2888,6 +2888,10 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
+                                libxl_domain_build_info *info);
+
+
 /*
  * Inserts "elm_new" into the sorted list "head".
  *
@@ -2937,6 +2941,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
  */
 #define CTYPE(isfoo,c) (isfoo((unsigned char)(c)))
 
+unsigned int libxl__vnodemap_is_usable(libxl__gc *gc,
+                                libxl_domain_build_info *info);
 
 #endif
 
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
index 20c99ac..68b53d7 100644
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -19,6 +19,8 @@
 
 #include "libxl_internal.h"
 
+#include "libxl_vnuma.h"
+
 /*
  * What follows are helpers for generating all the k-combinations
  * without repetitions of a set S with n elements in it. Formally
@@ -500,6 +502,53 @@ int libxl__get_numa_candidate(libxl__gc *gc,
 }
 
 /*
+ * Checks if vnuma_nodemap defined in info can be used
+ * for allocation of vnodes on physical NUMA nodes by
+ * verifying that there is enough memory on corresponding
+ * NUMA nodes.
+ */
+unsigned int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
+{
+    unsigned int i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *mems;
+    int rc, nr_nodes;
+
+    rc = nr_nodes = 0;
+
+    /*
+     * Cannot use the specified mapping if this is not a NUMA machine.
+     */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return rc;
+
+    mems = info->vnuma_memszs;
+    claim = libxl__calloc(gc, nr_nodes, sizeof(*claim));
+    /* Sum memory request on per pnode basis */
+    for (i = 0; i < info->nr_vnodes; i++)
+    {
+        node = info->vnode_to_pnode[i];
+        /* Correct pnode number? */
+        if (node < nr_nodes)
+            claim[node] += (mems[i] << 20);
+        else
+            goto vmapu;
+    }
+    for (i = 0; i < nr_nodes; i++) {
+        if (claim[i] > ninfo[i].free)
+            /* Cannot satisfy the user request, fall back to the default */
+            goto vmapu;
+    }
+    rc = 1;
+
+ vmapu:
+    return rc;
+}
+
+/*
  * Local variables:
  * mode: C
  * c-basic-offset: 4
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index e1c183f..35c4d67 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -1,5 +1,6 @@
 #include "libxl_internal.h"
 #include "libxl_arch.h"
+#include "libxl_vnuma.h"
 
 static const char *e820_names(int type)
 {
@@ -317,3 +318,125 @@ int libxl__arch_domain_configure(libxl__gc *gc,
 {
     return 0;
 }
+
+/*
+ * Used for a PV guest with e820_host enabled, which therefore
+ * has a non-contiguous e820 memory map.
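+ * Returns the number of bytes in [start, end) that are not covered by
+ * an E820_RAM region; only RAM regions that contain the start or the
+ * end of the range are taken into account.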
+ */
+static unsigned long e820_memory_hole_size(unsigned long start,
+                                            unsigned long end,
+                                            struct e820entry e820[],
+                                            int nr)
+{
+    int i;
+    unsigned long absent, start_pfn, end_pfn;
+
+    absent = end - start;
+    for (i = 0; i < nr; i++) {
+        /* If not an E820_RAM region, skip it and don't subtract from absent */
+        if (e820[i].type == E820_RAM) {
+            start_pfn = e820[i].addr;
+            end_pfn =   e820[i].addr + e820[i].size;
+            /* beginning pfn is in this region? */
+            if (start >= start_pfn && start <= end_pfn) {
+                if (end > end_pfn)
+                    absent -= end_pfn - start;
+                else
+                    /* fit the region? then no absent pages */
+                    absent -= end - start;
+                continue;
+            }
+            /* found the end of range in this region? */
+            if (end <= end_pfn && end >= start_pfn) {
+                absent -= end - start_pfn;
+                /* no need to look for more ranges */
+                break;
+            }
+        }
+    }
+    return absent;
+}
+
+/*
+ * Checks for the beginning and end of RAM in the e820 map for the domain
+ * and aligns the start of the first and the end of the last vNUMA memory
+ * block to that map. vnode memory sizes are passed here in megabytes.
+ * For a PV guest the e820 map has fixed hole sizes.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            libxl_domain_build_info *b_info, /* IN: mem sizes */
+                            vmemrange_t *memblks)        /* OUT: linux numa blocks in pfn */
+{
+    int i, j, rc;
+    uint64_t next_start_pfn, end_max = 0, size;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+
+    if (b_info->nr_vnodes == 0)
+        return -EINVAL;
+
+    /* Retrieve the e820 map for this host */
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+    {
+        errno = rc;
+        return -EINVAL;
+    }
+
+    /* Find the highest usable RAM address on this host */
+    for (j = nr - 1; j >= 0; j--)
+        if (map[j].type == E820_RAM) {
+            end_max = map[j].addr + map[j].size;
+            break;
+        }
+
+    memset(memblks, 0, sizeof(*memblks) * b_info->nr_vnodes);
+    next_start_pfn = 0;
+
+    memblks[0].start = map[0].addr;
+
+    for (i = 0; i < b_info->nr_vnodes; i++) {
+        /* the start may be non-zero */
+        memblks[i].start += next_start_pfn;
+        memblks[i].end = memblks[i].start + (b_info->vnuma_memszs[i] << 20);
+        memblks[i]._reserved = 0;
+
+        size = memblks[i].end - memblks[i].start;
+        /*
+         * For a PV guest with the e820_host option turned on we need
+         * to take memory holes into account. For a PV guest with
+         * e820_host disabled or unset, the map is a contiguous
+         * RAM region.
+         */
+        if (libxl_defbool_val(b_info->u.pv.e820_host)) {
+            while ((memblks[i].end - memblks[i].start -
+                    e820_memory_hole_size(memblks[i].start,
+                                          memblks[i].end, map, nr)) < size)
+            {
+                memblks[i].end += MIN_VNODE_SIZE << 10;
+                if (memblks[i].end > end_max) {
+                    memblks[i].end = end_max;
+                    break;
+                }
+            }
+        }
+        next_start_pfn = memblks[i].end;
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "i %d, start = %#010lx, end = %#010lx\n",
+                   i, memblks[i].start, memblks[i].end);
+    }
+    if (memblks[i-1].end > end_max)
+        memblks[i-1].end = end_max;
+
+    return 0;
+}
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

