
[Xen-devel] [PATCH 09 of 11] libxl, xl: enable automatic placement of guests on NUMA nodes



If a domain does not have a VCPU affinity, try to pin it automatically
to some PCPUs. This is done taking into account the NUMA characteristics
of the host: we look for a combination of the host's NUMA nodes that has
enough free memory for the new domain, and pin it to the PCPUs of those
nodes. Smaller combinations are considered first, to avoid spreading the
domain's memory among too many nodes.

After that, we also ensure that the chosen combination comprises at
least as many PCPUs as the domain has VCPUs, increasing the number of
nodes it is made up of if that is not the case. When adding new nodes
to a combination, node distances are taken into account, trying to
minimize the performance impact on the domain.
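
Roughly speaking, the new interface is meant to be used as in the
following simplified sketch, which mirrors what place_domain() in xl
(introduced below) does; error handling, the cpupool check and the
computation of the minimum number of nodes are omitted for brevity:

    int c, i, nr_cpus, nr_cndts;
    libxl_cputopology *tinfo;
    libxl_numa_candidate *cndts;

    /* 1) Node combinations with enough free memory for the domain,
     *    smallest combinations first */
    cndts = libxl_domain_numa_candidates(ctx, b_info, 1, &nr_cndts);

    /* 2) Pick the first candidate that has, or can be extended to
     *    have, at least as many PCPUs as the domain's VCPUs */
    for (c = 0; c < nr_cndts; c++)
        if (libxl_numa_candidate_add_cpus(ctx, b_info->max_vcpus,
                                          -1, &cndts[c]) == 0)
            break;

    /* 3) Pin the domain to the PCPUs of the chosen nodes */
    tinfo = libxl_get_cpu_topology(ctx, &nr_cpus);
    libxl_cpumap_set_none(&b_info->cpumap);
    for (i = 0; i < nr_cpus; i++)
        if (libxl_nodemap_test(&cndts[c].nodemap, tinfo[i].node))
            libxl_cpumap_set(&b_info->cpumap, i);

    libxl_cputopology_list_free(tinfo, nr_cpus);
    libxl_numa_candidates_list_free(cndts, nr_cndts);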

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -111,8 +111,8 @@ created online and the remainder will be
 
 =item B<cpus="CPU-LIST">
 
-List of which cpus the guest is allowed to use. Default behavior is
-`all cpus`. A C<CPU-LIST> may be specified as follows:
+List of which cpus the guest is allowed to use. By default xl will
+pick some cpus (see below). A C<CPU-LIST> may be specified as follows:
 
 =over 4
 
@@ -132,6 +132,12 @@ run on cpu #3 of the host.
 
 =back
 
+If this option is not specified, xl automatically tries to place the new
+domain on the host's NUMA nodes (provided the host has more than one NUMA
+node) by pinning it to the cpus of those nodes. The heuristic employed
+tries to maximize performance for the domain itself, while also achieving
+efficient resource utilization.
+
 =item B<cpu_weight=WEIGHT>
 
 A domain with a weight of 512 will get twice as much CPU as a domain
diff --git a/tools/libxl/gentest.py b/tools/libxl/gentest.py
--- a/tools/libxl/gentest.py
+++ b/tools/libxl/gentest.py
@@ -131,6 +131,19 @@ static void libxl_cpumap_rand_init(libxl
     }
 }
 
+static void libxl_nodemap_rand_init(libxl_nodemap *nodemap)
+{
+    int i;
+    nodemap->size = rand() % 16;
+    nodemap->map = calloc(nodemap->size, sizeof(*nodemap->map));
+    libxl_for_each_node(i, *nodemap) {
+        if (rand() % 2)
+            libxl_nodemap_set(nodemap, i);
+        else
+            libxl_nodemap_reset(nodemap, i);
+    }
+}
+
 static void libxl_key_value_list_rand_init(libxl_key_value_list *pkvl)
 {
     int i, nr_kvp = rand() % 16;
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -626,6 +626,15 @@ libxl_cpupoolinfo * libxl_list_cpupool(l
 libxl_vminfo * libxl_list_vm(libxl_ctx *ctx, int *nb_vm);
 void libxl_vminfo_list_free(libxl_vminfo *list, int nr);
 
+/* Automatic NUMA placement */
+libxl_numa_candidate *libxl_domain_numa_candidates(libxl_ctx *ctx,
+                                        libxl_domain_build_info *b_info,
+                                        int min_nodes, int *nr_cndts);
+int libxl_numa_candidate_add_cpus(libxl_ctx *ctx,
+                                  int min_cpus, int max_nodes,
+                                  libxl_numa_candidate *candidate);
+void libxl_numa_candidates_list_free(libxl_numa_candidate *list, int nr);
+
 /*
  * Devices
  * =======
diff --git a/tools/libxl/libxl_json.c b/tools/libxl/libxl_json.c
--- a/tools/libxl/libxl_json.c
+++ b/tools/libxl/libxl_json.c
@@ -119,6 +119,26 @@ out:
     return s;
 }
 
+yajl_gen_status libxl_nodemap_gen_json(yajl_gen hand,
+                                       libxl_nodemap *nodemap)
+{
+    yajl_gen_status s;
+    int i;
+
+    s = yajl_gen_array_open(hand);
+    if (s != yajl_gen_status_ok) goto out;
+
+    libxl_for_each_node(i, *nodemap) {
+        if (libxl_nodemap_test(nodemap, i)) {
+            s = yajl_gen_integer(hand, i);
+            if (s != yajl_gen_status_ok) goto out;
+        }
+    }
+    s = yajl_gen_array_close(hand);
+out:
+    return s;
+}
+
 yajl_gen_status libxl_key_value_list_gen_json(yajl_gen hand,
                                               libxl_key_value_list *pkvl)
 {
diff --git a/tools/libxl/libxl_json.h b/tools/libxl/libxl_json.h
--- a/tools/libxl/libxl_json.h
+++ b/tools/libxl/libxl_json.h
@@ -27,6 +27,7 @@ yajl_gen_status libxl_domid_gen_json(yaj
 yajl_gen_status libxl_uuid_gen_json(yajl_gen hand, libxl_uuid *p);
 yajl_gen_status libxl_mac_gen_json(yajl_gen hand, libxl_mac *p);
 yajl_gen_status libxl_cpumap_gen_json(yajl_gen hand, libxl_cpumap *p);
+yajl_gen_status libxl_nodemap_gen_json(yajl_gen hand, libxl_nodemap *p);
 yajl_gen_status libxl_cpuid_policy_list_gen_json(yajl_gen hand,
                                                  libxl_cpuid_policy_list *p);
 yajl_gen_status libxl_string_list_gen_json(yajl_gen hand, libxl_string_list *p);
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -439,6 +439,12 @@ libxl_cputopology = Struct("cputopology"
     ("node", uint32),
     ], dir=DIR_OUT)
 
+libxl_numa_candidate = Struct("numa_candidate", [
+    ("nr_nodes", integer),
+    ("free_memkb", uint32),
+    ("nodemap", libxl_nodemap),
+    ], dir=DIR_OUT)
+
 libxl_sched_credit_domain = Struct("sched_credit_domain", [
     ("weight", integer),
     ("cap", integer),
diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c
--- a/tools/libxl/libxl_utils.c
+++ b/tools/libxl/libxl_utils.c
@@ -613,6 +613,250 @@ int libxl__enum_from_string(const libxl_
     return ERROR_FAIL;
 }
 
+static uint64_t node_to_nodemap_distance(const libxl_numainfo *ninfo,
+                                         libxl_nodemap *nodemap,
+                                         int node)
+{
+    int i;
+    uint64_t dist = 0;
+
+    /* Sum of the distances of a node from all the nodes in the map */
+    libxl_for_each_set_node(i, *nodemap)
+        dist += labs(ninfo[node].dists[i]);
+
+    return dist;
+}
+
+static int add_node_to_nodemap(const libxl_numainfo *ninfo, int nr_nodes,
+                               libxl_nodemap *nodemap)
+{
+    int i, node = -1;
+    uint64_t min_dist = UINT64_MAX;
+
+    /* Find the node closest to our nodemap */
+    for (i = 0; i < nr_nodes; i++) {
+        uint64_t dist;
+
+        /* Nodes already in the nodemap are not suitable */
+        if (libxl_nodemap_test(nodemap, i))
+            continue;
+
+        dist = node_to_nodemap_distance(ninfo, nodemap, i);
+        if (dist < min_dist) {
+            min_dist = dist;
+            node = i;
+        }
+    }
+    if (node >= 0)
+        libxl_nodemap_set(nodemap, node);
+
+    return node;
+}
+
+/*
+ * The following two utility functions compute the node maps
+ * corresponding to the [ n! / (k! * (n - k)!) ] combinations
+ * of @size nodes within a set that is @nr_nodes big, without
+ * repetition. The algorithm used for generating them uses
+ * a vector containing the indexes in the set of the elements
+ * of the i-th combination to generate the (i+1)-th, and
+ * ensures they come in lexicographic order.
+ *
+ * For example, with n = 5 and k = 3, calling numa_node_set_init()
+ * will generate "0, 1, 2", while subsequent calls to
+ * numa_node_set_next() yield the following:
+ * "0, 1, 2
+ *  0, 1, 3
+ *  0, 1, 4
+ *  0, 2, 3
+ *  0, 2, 4
+ *  0, 3, 4
+ *  1, 2, 3
+ *  1, 2, 4
+ *  1, 3, 4
+ *  2, 3, 4"
+ */
+static void numa_node_set_init(int nr_nodes, int size, int *idxs,
+                              libxl_nodemap *nodemap)
+{
+    int i;
+
+    /* First set always is "0, 1, 2, ..., size-1" */
+    libxl_nodemap_set_none(nodemap);
+    for (i = 0; i < size; i++) {
+        libxl_nodemap_set(nodemap, i);
+        idxs[i] = i;
+    }
+}
+
+static int numa_node_set_next(int nr_nodes, int size, int *idxs,
+                              libxl_nodemap *nodemap)
+{
+    int i;
+
+    /* Check whether we just need to increase the rightmost index */
+    if (idxs[size - 1] < nr_nodes - 1) {
+        idxs[size - 1]++;
+        goto build_nodemap;
+    }
+
+    /* Find the rightmost element from where to start the increasing */
+    for (i = size - 1; idxs[i] == nr_nodes - size + i; i--) {
+        if (i <= 0) {
+            /* No more valid combinations! */
+            return -1;
+        }
+    }
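+    /* Increment it, and give the indexes to its right consecutive values */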
+    for (idxs[i]++, i += 1; i < size; i++)
+        idxs[i] = idxs[i - 1] + 1;
+
+ build_nodemap:
+    libxl_nodemap_set_none(nodemap);
+    for (i = 0; i < size; i++)
+        libxl_nodemap_set(nodemap, idxs[i]);
+
+    return 0;
+}
+
+libxl_numa_candidate *libxl_domain_numa_candidates(libxl_ctx *ctx,
+                                        libxl_domain_build_info *b_info,
+                                        int min_nodes, int *nr_cndts)
+{
+    uint32_t dom_memkb, nodes_memkb;
+    libxl_numa_candidate *cndts = NULL;
+    libxl_nodemap suitable_nodes;
+    int *suitable_nodes_idx;
+    libxl_numainfo *ninfo;
+    int nr_nodes, i, next_comb_ok;
+
+    ninfo = libxl_get_numainfo(ctx, &nr_nodes);
+    if (ninfo == NULL) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_get_numainfo failed.\n");
+        goto out;
+    }
+
+    /* Prepare the nodemap and the index array for the combinations */
+    if (libxl_nodemap_alloc(ctx, &suitable_nodes) < 0) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_nodemap_alloc failed\n");
+        goto out_numainfo;
+    }
+    suitable_nodes_idx = malloc(sizeof(int) * nr_nodes);
+    if (suitable_nodes_idx == NULL) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "malloc failed\n");
+        goto out_nodemap;
+    }
+
+    /* Memory needed by the domain */
+    if (libxl_domain_need_memory(ctx, b_info, &dom_memkb)) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_domain_need_memory failed\n");
+        goto out_nodemap_idx;
+    }
+
+    *nr_cndts = 0;
+    while (1) {
+        /* Avoid considering a silly number of nodes */
+        if (min_nodes > nr_nodes)
+            break;
+
+        /* Generate the combinations and check their cumulative free memory */
+        numa_node_set_init(nr_nodes, min_nodes, suitable_nodes_idx,
+                           &suitable_nodes);
+        do {
+            nodes_memkb = 0;
+            libxl_for_each_set_node(i, suitable_nodes)
+                nodes_memkb += ninfo[i].free / 1024;
+
+            /* If free memory is enough, this is a valid candidate */
+            if (nodes_memkb >= dom_memkb) {
+                libxl_numa_candidate *new_cndts;
+
+                new_cndts = realloc(cndts, sizeof(cndts[0]) * (*nr_cndts+1));
+                if (new_cndts == NULL) {
+                    LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "realloc failed\n");
+                    libxl_numa_candidates_list_free(cndts, *nr_cndts);
+                    cndts = NULL;
+                    goto out_nodemap_idx;
+                }
+                cndts = new_cndts;
+
+                libxl_nodemap_alloc(ctx, &cndts[*nr_cndts].nodemap);
+                libxl_nodemap_copy(&cndts[*nr_cndts].nodemap, &suitable_nodes);
+                cndts[*nr_cndts].free_memkb = nodes_memkb;
+                cndts[*nr_cndts].nr_nodes = min_nodes;
+                (*nr_cndts)++;
+            }
+
+            next_comb_ok = numa_node_set_next(nr_nodes, min_nodes,
+                                              suitable_nodes_idx,
+                                              &suitable_nodes) == 0;
+        } while (next_comb_ok);
+
+        min_nodes++;
+    }
+
+ out_nodemap_idx:
+    free(suitable_nodes_idx);
+ out_nodemap:
+    libxl_nodemap_dispose(&suitable_nodes);
+ out_numainfo:
+    libxl_numainfo_list_free(ninfo, nr_nodes);
+ out:
+    return cndts;
+}
+
+int libxl_numa_candidate_add_cpus(libxl_ctx *ctx,
+                                  int min_cpus, int max_nodes,
+                                  libxl_numa_candidate *candidate)
+{
+    int nodes_cpus;
+    libxl_cputopology *tinfo;
+    libxl_numainfo *ninfo;
+    int nr_nodes, nr_cpus;
+    int i, rc;
+
+    tinfo = libxl_get_cpu_topology(ctx, &nr_cpus);
+    if (tinfo == NULL) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_get_cpu_topology failed\n");
+        rc = ERROR_NOMEM;
+        goto out;
+    }
+
+    ninfo = libxl_get_numainfo(ctx, &nr_nodes);
+    if (ninfo == NULL) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_get_numainfo failed\n");
+        rc = ERROR_NOMEM;
+        goto out_topologyinfo;
+    }
+
+    if (max_nodes == -1 || max_nodes > nr_nodes)
+        max_nodes = nr_nodes;
+
+    /* If a candidate is short on PCPUs, just add more nodes */
+    while (1) {
+        rc = ERROR_FAIL;
+
+        if (candidate->nr_nodes > max_nodes)
+            break;
+
+        for (i = 0, nodes_cpus = 0; i < nr_cpus; i++) {
+            if (libxl_nodemap_test(&candidate->nodemap, tinfo[i].node))
+                nodes_cpus++;
+        }
+
+        /* Either we have enough PCPUs, and thus we're done ... */
+        if (nodes_cpus >= min_cpus) {
+            rc = 0;
+            break;
+        }
+        /* ... Or we need more, and we want to try adding the closest node */
+        add_node_to_nodemap(ninfo, nr_nodes, &candidate->nodemap);
+        candidate->nr_nodes++;
+    }
+
+    libxl_numainfo_list_free(ninfo, nr_nodes);
+ out_topologyinfo:
+    libxl_cputopology_list_free(tinfo, nr_cpus);
+ out:
+    return rc;
+}
+
+void libxl_numa_candidates_list_free(libxl_numa_candidate *list, int nr)
+{
+    int i;
+    for (i = 0; i < nr; i++)
+        libxl_numa_candidate_dispose(&list[i]);
+    free(list);
+}
+
 void libxl_numainfo_list_free(libxl_numainfo *list, int nr)
 {
     int i;
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -490,6 +490,122 @@ static void split_string_into_string_lis
     free(s);
 }
 
+/* How many PCPUs are there on each node? */
+static int cpus_per_node(libxl_cputopology *tinfo, int nr_cpus)
+{
+    libxl_numainfo *ninfo;
+    int nr_nodes, j, i;
+    int cpus_nodes = 0;
+
+    ninfo = libxl_get_numainfo(ctx, &nr_nodes);
+    if (ninfo == NULL)
+        return 0;
+
+    /* This makes sense iff # of PCPUs is the same for all nodes */
+    for (j = 0; j < nr_nodes; j++) {
+        int curr_cpus = 0;
+
+        for (i = 0; i < nr_cpus; i++) {
+            if (tinfo[i].node == j)
+                curr_cpus++;
+        }
+        cpus_nodes = cpus_nodes == 0 ? curr_cpus : cpus_nodes;
+
+        /* Make sure the above holds, or turn the whole thing off! */
+        if (cpus_nodes != curr_cpus) {
+            cpus_nodes = 0;
+            break;
+        }
+    }
+
+    libxl_numainfo_list_free(ninfo, nr_nodes);
+    return cpus_nodes;
+}
+
+/* Try to achieve "optimal" NUMA placement */
+static int place_domain(libxl_domain_build_info *b_info)
+{
+    int dom_min_nodes, dom_max_nodes;
+    int dom_needs_cpus, cpus_nodes;
+    libxl_cputopology *tinfo;
+    libxl_cpupoolinfo *pinfo;
+    int nr_cpus, nr_pools;
+    libxl_numa_candidate *candidates;
+    int candidate, nr_candidates;
+    int i, err;
+
+    /* First of all, if the domain has an explicit cpu-affinity, just
+     * don't bother and leave it alone */
+    if (!libxl_cpumap_is_full(&b_info->cpumap))
+        return 0;
+
+    /* If cpupools are in use, better not to mess with them */
+    pinfo = libxl_list_cpupool(ctx, &nr_pools);
+    if (!pinfo) {
+        fprintf(stderr, "error getting cpupool info\n");
+        err = ENOMEM;
+        goto out;
+    }
+    if (nr_pools > 1) {
+        fprintf(stderr, "skip numa placement as cpupools are being used\n");
+        err = 0;
+        goto out_poolinfo;
+    }
+
+    tinfo = libxl_get_cpu_topology(ctx, &nr_cpus);
+    if (tinfo == NULL) {
+        fprintf(stderr, "libxl_get_cpu_topology failed.\n");
+        err = ENOMEM;
+        goto out_poolinfo;
+    }
+
+    /* If possible, start with a sensible minimum number of nodes */
+    cpus_nodes = cpus_per_node(tinfo, nr_cpus);
+    dom_needs_cpus = b_info->max_vcpus;
+
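+    /* We need at least ceil(#VCPUs / #PCPUs-per-node) nodes */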
+    if (cpus_nodes != 0)
+        dom_min_nodes = (dom_needs_cpus + cpus_nodes - 1) / cpus_nodes;
+    else
+        dom_min_nodes = 1;
+
+    /* Let's fit the domain memory-wise (ENOSPC if we don't manage) */
+    candidates = libxl_domain_numa_candidates(ctx, b_info, dom_min_nodes,
+                                              &nr_candidates);
+    if (!candidates) {
+        err = ENOSPC;
+        goto out_topologyinfo;
+    }
+
+    /* Pick a candidate and ensure it gives us enough PCPUs */
+    dom_max_nodes = -1; err = ERROR_FAIL;
+    for (candidate = 0; err && candidate < nr_candidates; candidate++) {
+        /* We try adding nodes up to all of them (dom_max_nodes = -1) */
+        err = libxl_numa_candidate_add_cpus(ctx, dom_needs_cpus, dom_max_nodes,
+                                            &candidates[candidate]);
+    }
+
+    if (err == ERROR_FAIL) {
+        /* Report back we didn't find a candidate with enough cpus */
+        err = ESRCH;
+    } else if (!err) {
+        /* Things went fine. Commit the map into domain's build info */
+        libxl_cpumap_set_none(&b_info->cpumap);
+        for (i = 0; i < nr_cpus; i++) {
+            if (libxl_nodemap_test(&candidates[candidate-1].nodemap,
+                                   tinfo[i].node))
+                libxl_cpumap_set(&b_info->cpumap, i);
+        }
+    }
+
+    libxl_numa_candidates_list_free(candidates, nr_candidates);
+out_topologyinfo:
+    libxl_cputopology_list_free(tinfo, nr_cpus);
+out_poolinfo:
+    for (i = 0; i < nr_pools; i++)
+        libxl_cpupoolinfo_dispose(&pinfo[i]);
+out:
+    return err;
+}
+
 static int vcpupin_parse(char *cpu, libxl_cpumap *cpumap)
 {
     libxl_cpumap exclude_cpumap;
@@ -1738,6 +1854,16 @@ start:
         goto error_out;
     }
 
+    ret = place_domain(&d_config.b_info);
+    if (ret == ESRCH || ret == ENOSPC) {
+        fprintf(stderr, "Failed to place the domain, falling back to all nodes\n");
+        /* Resetting domain's cpu-affinity is enough for that */
+        libxl_cpumap_set_any(&d_config.b_info.cpumap);
+    } else if (ret != 0) {
+        ret = ERROR_FAIL;
+        goto error_out;
+    }
+
     libxl_asyncprogress_how autoconnect_console_how_buf;
     if ( dom_info->console_autoconnect ) {
         autoconnect_console_how_buf.callback = autoconnect_console;
diff --git a/tools/python/xen/lowlevel/xl/xl.c b/tools/python/xen/lowlevel/xl/xl.c
--- a/tools/python/xen/lowlevel/xl/xl.c
+++ b/tools/python/xen/lowlevel/xl/xl.c
@@ -320,6 +320,23 @@ PyObject *attrib__libxl_file_reference_g
     return genwrap__string_get(&pptr->path);
 }
 
+PyObject *attrib__libxl_nodemap_get(libxl_nodemap *pptr)
+{
+    PyObject *nodelist = NULL;
+    int i;
+
+    nodelist = PyList_New(0);
+    libxl_for_each_node(i, *pptr) {
+        if ( libxl_nodemap_test(pptr, i) ) {
+            PyObject* pyint = PyInt_FromLong(i);
+
+            PyList_Append(nodelist, pyint);
+            Py_DECREF(pyint);
+        }
+    }
+    return nodelist;
+}
+
 PyObject *attrib__libxl_hwcap_get(libxl_hwcap *pptr)
 {
     PyErr_SetString(PyExc_NotImplementedError, "Getting hwcap");
