
[Xen-devel] [PATCH v7 9/9] libxl: vnuma topology configuration parser and doc



Parses the vNUMA topology: the number of nodes and memory
ranges. If not defined in the config, initializes vNUMA with
only one node and the default topology: this single node covers
all domain memory and has all vcpus assigned to it.
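
An illustrative guest config fragment using the new options
(values are examples only):

    vcpus = 4
    memory = 2048
    vnodes = 2
    vnuma_mem = [1024, 1024]
    vdistance = [10, 20]
    vnuma_vcpumap = [0, 0, 1, 1]
    vnuma_vnodemap = [1, 0]
    vnuma_autoplacement = 1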

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
 docs/man/xl.cfg.pod.5    |   77 +++++++++
 tools/libxl/xl_cmdimpl.c |  425 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 502 insertions(+)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index 1e04eed..adc52b4 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
 A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
 it will crash.
 
+=item B<vnodes=N>
+
+Number of vNUMA nodes the guest will be initialized with on boot.
+A PV guest will have one vNUMA node by default.
+
+=item B<vnuma_mem=[vmem1, vmem2, ...]>
+
+List of memory sizes for each node, defined in MBytes. The number of items
+listed must match the number of vnodes. If the sum of all vnode memory sizes
+does not match the domain memory, or there are missing nodes, domain creation will fail.
+If not specified, memory will be split equally between vnodes. The current minimum
+memory size for one node is 32MB.
+
+Example: vnuma_mem=[1024, 1024, 2048, 2048]
+Total amount of memory in guest: 6GB
+
+=item B<vdistance=[d1, d2]>
+
+Defines the distance table for vNUMA nodes. NUMA topology distances are
+represented by a two-dimensional square matrix whose element [i,j] is the
+distance between nodes i and j. The trivial case is where all diagonal
+elements are equal and the matrix is symmetrical. The vdistance option
+allows two values to be defined, d1 and d2: d1 is used for all diagonal
+elements of the distance matrix, and all other elements are set to d2.
+Distances are usually multiples of 10 in Linux, and the same convention
+is used here. If not specified, the default values [10, 20] will be used.
+For a single node the default distance is [10].
+
+Examples:
+vnodes = 3
+vdistance=[10, 20]
+will create this distance table (this is also the default setting):
+[10, 20, 20]
+[20, 10, 20]
+[20, 20, 10]
+
+=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
+
+Defines the vcpu to vnode mapping as a list of integers. The position in the list
+is the vcpu number, and the value is the vnode number to which the vcpu will be
+assigned.
+Current limitations:
+- A vNUMA node must have at least one vcpu, otherwise the default vcpu_to_vnode mapping will be used.
+- The total number of vnodes cannot be greater than the number of vcpus.
+
+Example:
+Map of 4 vcpus to 2 vnodes:
+0,1 vcpu -> vnode0
+2,3 vcpu -> vnode1:
+
+vnuma_vcpumap = [0, 0, 1, 1]
+ 4 vcpus here -  0  1  2  3
+
+=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
+
+List of physical node numbers; the position in the list represents the vnode number.
+Used for manual placement of vNUMA nodes on physical NUMA nodes.
+It will not be used if automatic NUMA placement is active.
+
+Example:
+Assume a NUMA machine with 4 physical nodes. To place vnuma node 0 on pnode 2
+and vnuma node 1 on pnode 3:
+vnode0 -> pnode2
+vnode1 -> pnode3
+
+vnuma_vnodemap=[2, 3]
+The first vnode will be placed on pnode 2, the second on pnode 3.
+
+=item B<vnuma_autoplacement=[0|1]>
+
+If set to 1 and automatic NUMA placement is enabled, xl will automatically find the best
+physical nodes on which to place the vnuma nodes, and vnuma_vnodemap will be ignored. Automatic
+NUMA placement is enabled if the domain has no pinned cpus.
+If vnuma_autoplacement is set to 0, the vnodes will be placed on the NUMA nodes set in
+vnuma_vnodemap, provided there is enough memory on those physical nodes. If there is not, the
+allocation will be made on any of the available nodes and may span multiple physical NUMA nodes.
+
 =back
 
 =head3 Event Actions
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index f1c136a..6a4d657 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
 #include "libxl_json.h"
 #include "libxlutil.h"
 #include "xl.h"
+#include "libxl_vnuma.h"
 
 /* For calls which return an errno on failure */
 #define CHK_ERRNOVAL( call ) ({                                         \
@@ -766,6 +767,423 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
     }
 }
 
+static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+    const char *buf;
+    char *ep;
+    unsigned long ul;
+    int rc = -EINVAL;
+
+    buf = xlu_cfg_get_listitem(list, i);
+    if (!buf)
+        return rc;
+    ul = strtoul(buf, &ep, 10);
+    if (ep == buf)
+        return rc;
+    if (ul >= UINT16_MAX)
+        return rc;
+    return (unsigned int)ul;
+}
+
+static void vdistance_set(unsigned int *vdistance,
+                                unsigned int nr_vnodes,
+                                unsigned int samenode,
+                                unsigned int othernode)
+{
+    unsigned int idx, slot;
+    for (idx = 0; idx < nr_vnodes; idx++)
+        for (slot = 0; slot < nr_vnodes; slot++)
+            *(vdistance + slot * nr_vnodes + idx) =
+                idx == slot ? samenode : othernode;
+}
+
+static void vcputovnode_default(unsigned int *cpu_to_node,
+                                unsigned int nr_vnodes,
+                                unsigned int max_vcpus)
+{
+    unsigned int cpu;
+    for (cpu = 0; cpu < max_vcpus; cpu++)
+        cpu_to_node[cpu] = cpu % nr_vnodes;
+}
+
+/* Split domain memory between vNUMA nodes equally. */
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+    unsigned long long vnodemem = 0;
+    unsigned long n;
+    unsigned int i;
+
+    if (b_info->vnodes == 0)
+        return -1;
+
+    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
+    if (vnodemem < MIN_VNODE_SIZE)
+        return -1;
+    /* remainder in MBytes. */
+    n = (b_info->max_memkb >> 10) % b_info->vnodes;
+    /* get final sizes in MBytes. */
+    for (i = 0; i < (b_info->vnodes - 1); i++)
+        b_info->vnuma_mem[i] = vnodemem;
+    /* add the remainder to the last node. */
+    b_info->vnuma_mem[i] = vnodemem + n;
+    return 0;
+}
+
+static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
+                                   unsigned int nr_vnodes)
+{
+    unsigned int i;
+    for (i = 0; i < nr_vnodes; i++)
+        vnuma_vnodemap[i] = VNUMA_NO_NODE;
+}
+
+/*
+ * init vNUMA to "zero config" with one node and all other
+ * topology parameters set to default.
+ */
+static int vnuma_default_config(libxl_domain_build_info *b_info)
+{
+    b_info->vnodes = 1;
+    /* all memory goes to this one vnode, as well as vcpus. */
+    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
+                                sizeof(*b_info->vnuma_mem))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
+                                sizeof(*b_info->vnuma_vcpumap))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
+                                b_info->vnodes, sizeof(*b_info->vdistance))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
+                                sizeof(*b_info->vnuma_vnodemap))))
+        goto bad_vnumazerocfg;
+
+    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
+
+    /* all vcpus assigned to this vnode. */
+    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
+                        b_info->max_vcpus);
+
+    /* default vdistance is 10. */
+    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
+
+    /* VNUMA_NO_NODE for vnode_to_pnode. */
+    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
+
+    /*
+     * The vnodes will be placed on physical nodes chosen by automatic
+     * NUMA placement; VNUMA_NO_NODE means no exact pnode is requested.
+     */
+    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
+    return 0;
+
+ bad_vnumazerocfg:
+    return -1;
+}
+
+static void free_vnuma_info(libxl_domain_build_info *b_info)
+{
+    free(b_info->vnuma_mem);
+    free(b_info->vdistance);
+    free(b_info->vnuma_vcpumap);
+    free(b_info->vnuma_vnodemap);
+    b_info->vnodes = 0;
+}
+
+static int parse_vnuma_mem(XLU_Config *config,
+                            libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnumamemcfg;
+    int nr_vnuma_regions, i;
+    unsigned long long vnuma_memparsed = 0;
+    unsigned long ul;
+    const char *buf;
+
+    dst = *b_info;
+    if (!xlu_cfg_get_list(config, "vnuma_mem",
+                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
+
+        if (nr_vnuma_regions != dst->vnodes) {
+            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
+                    incorrect (should be %d).\n", nr_vnuma_regions,
+                    dst->vnodes);
+            goto bad_vnuma_mem;
+        }
+
+        dst->vnuma_mem = calloc(dst->vnodes,
+                                 sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        char *ep;
+        /*
+         * Will parse only nr_vnodes times, even if we have more/less regions.
+         * Take care of it later if less or discard if too many regions.
+         */
+        for (i = 0; i < dst->vnodes; i++) {
+            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+            if (!buf) {
+                fprintf(stderr,
+                        "xl: Unable to get element %d in vnuma memory list.\n", i);
+                goto bad_vnuma_mem;
+            }
+
+            ul = strtoul(buf, &ep, 10);
+            if (ep == buf) {
+                fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf);
+                goto bad_vnuma_mem;
+            }
+
+            /* 32MB is the minimum size for a node, taken from Linux */
+            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
+                        ul, MIN_VNODE_SIZE, UINT32_MAX);
+                goto bad_vnuma_mem;
+            }
+
+            /* memory in MBytes */
+            dst->vnuma_mem[i] = ul;
+        }
+
+        /* Total memory for vNUMA parsed to verify */
+        for (i = 0; i < nr_vnuma_regions; i++)
+            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
+
+        /* Amount of memory for vnodes same as total? */
+        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
+            fprintf(stderr, "xl: vnuma memory is not the same as domain \
+                    memory size.\n");
+            goto bad_vnuma_mem;
+        }
+    } else {
+        dst->vnuma_mem = calloc(dst->vnodes,
+                                      sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
+        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
+                to cover %lu Kbytes.\n",
+                dst->max_memkb / dst->vnodes, dst->max_memkb);
+
+        if (split_vnumamem(dst) < 0) {
+            fprintf(stderr, "Could not split vnuma memory into equal 
chunks.\n");
+            goto bad_vnuma_mem;
+        }
+    }
+    return 0;
+
+ bad_vnuma_mem:
+    return -1;
+}
+
+static int parse_vnuma_distance(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vdistancecfg;
+    int nr_vdist;
+
+    dst = *b_info;
+    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
+                               sizeof(*dst->vdistance));
+    if (dst->vdistance == NULL)
+        goto bad_distance;
+
+    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
+        int d1, d2, i;
+        /*
+         * The first value is the same-node distance, the second is used for
+         * all other distances. This is required right now to avoid a
+         * non-symmetrical distance table, as that may break recent kernels.
+         * TODO: a better way to analyze an extended distance table, possibly
+         * OS specific.
+         */
+
+        for (i = 0; i < nr_vdist; i++) {
+            d1 = get_list_item_uint(vdistancecfg, i);
+        }
+
+        d1 = get_list_item_uint(vdistancecfg, 0);
+        if (dst->vnodes > 1)
+           d2 = get_list_item_uint(vdistancecfg, 1);
+        else
+           d2 = d1;
+
+        if (d1 >= 0 && d2 >= 0) {
+            if (d1 < d2)
+                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2);
+            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
+        } else {
+            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
+            goto bad_distance;
+        }
+    } else {
+        fprintf(stderr, "Could not parse vnuma distances.\n");
+        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
+    }
+    return 0;
+
+ bad_distance:
+    return -1;
+}
+
+static int parse_vnuma_vcpumap(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vcpumap;
+    int nr_vcpumap, i;
+
+    dst = *b_info;
+    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
+                                     sizeof(*dst->vnuma_vcpumap));
+    if (dst->vnuma_vcpumap == NULL)
+        goto bad_vcpumap;
+
+    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
+                          &vcpumap, &nr_vcpumap, 0)) {
+        if (nr_vcpumap == dst->max_vcpus) {
+            unsigned int  vnode, vcpumask = 0, vmask;
+            vmask = ~(~0 << nr_vcpumap);
+            for (i = 0; i < nr_vcpumap; i++) {
+                vnode = get_list_item_uint(vcpumap, i);
+                if (vnode >= 0 && vnode < dst->vnodes) {
+                    vcpumask  |= (1 << i);
+                    dst->vnuma_vcpumap[i] = vnode;
+                }
+            }
+
+            /* Did it cover all vnodes in the vcpu mask? */
+            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
+                fprintf(stderr, "WARNING: Not all vnodes were covered \
+                        in vnuma_vcpumap.\n");
+                goto bad_vcpumap;
+            }
+        } else {
+            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
+            goto bad_vcpumap;
+        }
+    }
+    else
+        vcputovnode_default(dst->vnuma_vcpumap,
+                            dst->vnodes,
+                            dst->max_vcpus);
+    return 0;
+
+ bad_vcpumap:
+    return -1;
+}
+
+static int parse_vnuma_vnodemap(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnodemap;
+    int nr_vnodemap, i;
+
+    dst = *b_info;
+
+    /* Is there a mapping to physical NUMA nodes? */
+    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
+                                    sizeof(*dst->vnuma_vnodemap));
+    if (dst->vnuma_vnodemap == NULL)
+        goto bad_vnodemap;
+    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",&vnodemap,
+                                            &nr_vnodemap, 0)) {
+        /*
+         * If not specified or incorrect, the mapping will be defined
+         * later, based on the machine architecture, configuration
+         * and memory available when creating the domain.
+         */
+        libxl_defbool_set(&dst->vnuma_autoplacement, false);
+        if (nr_vnodemap == dst->vnodes) {
+            unsigned int vnodemask = 0, pnode, smask;
+            smask = ~(~0 << dst->vnodes);
+            for (i = 0; i < dst->vnodes; i++) {
+                pnode = get_list_item_uint(vnodemap, i);
+                if (pnode >= 0) {
+                    vnodemask |= (1 << i);
+                    dst->vnuma_vnodemap[i] = pnode;
+                }
+            }
+
+            /* Did it cover all vnodes in the mask? */
+            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
+                fprintf(stderr, "WARNING: Not all vnodes were covered \
+                        in vnuma_vnodemap.\n");
+                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+                libxl_defbool_set(&dst->vnuma_autoplacement, true);
+                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+            }
+        }
+        else {
+            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
+            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+            libxl_defbool_set(&dst->vnuma_autoplacement, true);
+            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+        }
+    }
+    else {
+        fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
+        fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+        libxl_defbool_set(&dst->vnuma_autoplacement, true);
+        vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+    }
+    return 0;
+
+ bad_vnodemap:
+    return -1;
+
+}
+
+static void parse_vnuma_config(XLU_Config *config,
+                               libxl_domain_build_info *b_info)
+{
+    long l;
+
+    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+        if (l > MAX_VNUMA_NODES) {
+            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
+                    MAX_VNUMA_NODES);
+            goto bad_vnuma_config;
+        }
+        b_info->vnodes = l;
+
+        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
+                    &b_info->vnuma_autoplacement, 0))
+            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
+
+        /* Only construct nodes with at least one vcpu. */
+        if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) {
+            if (parse_vnuma_mem(config, &b_info) ||
+                parse_vnuma_distance(config, &b_info) ||
+                parse_vnuma_vcpumap(config, &b_info) ||
+                parse_vnuma_vnodemap(config, &b_info))
+                goto bad_vnuma_config;
+        }
+        else if (vnuma_default_config(b_info))
+            goto bad_vnuma_config;
+    }
+    /* If vnuma topology is not defined for domain, init one node */
+    else if (vnuma_default_config(b_info))
+            goto bad_vnuma_config;
+    return;
+
+ bad_vnuma_config:
+    fprintf(stderr, "Failed to set vnuma config :(\n");
+    free_vnuma_info(b_info);
+    exit(1);
+}
+
 static void parse_config_data(const char *config_source,
                               const char *config_data,
                               int config_len,
@@ -1063,6 +1481,13 @@ static void parse_config_data(const char *config_source,
             exit(1);
         }
 
+
+        /*
+         * If there is no vnuma in config, "zero" vnuma config
+         * will be initialized with one node and other defaults.
+         */
+        parse_vnuma_config(config, b_info);
+
         xlu_cfg_replace_string (config, "bootloader", &b_info->u.pv.bootloader, 0);
         switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
                                       &b_info->u.pv.bootloader_args, 1))
-- 
1.7.10.4

