[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 10 of 11] libxl, xl: heuristics for reordering NUMA placement candidates
Once we know which ones of all the possible combinations represent valid placement candidates for a domain, use some heuristics for deciding which one to pick (instead of just taking the first one). First of all, the smaller candidates are better both from the domain's point of view (less memory spreading among nodes) and from the system point of view (less memory fragmentation). In case of candidates of equal sizes, the one that has the greater amount of memory by at least 10% wins, as this is (again) good for keeping the fragmentation small. Finally, the number of domains running on the nodes involved in the combinations is checked, and the "least populated" candidate is the one that is considered. This makes the whole automatic NUMA placement mechanism very similar to what xm/xend does, although no memory considerations are present there. Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx> diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h --- a/tools/libxl/libxl.h +++ b/tools/libxl/libxl.h @@ -633,6 +633,8 @@ libxl_numa_candidate *libxl_domain_numa_ int libxl_numa_candidate_add_cpus(libxl_ctx *ctx, int min_cpus, int max_nodes, libxl_numa_candidate *candidate); +int libxl_numa_candidate_count_domains(libxl_ctx *ctx, + libxl_numa_candidate *candidate); void libxl_numa_candidates_list_free(libxl_numa_candidate *list, int nr); /* diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -441,6 +441,7 @@ libxl_cputopology = Struct("cputopology" libxl_numa_candidate = Struct("numa_candidate", [ ("nr_nodes", integer), + ("nr_domains", integer), ("free_memkb", uint32), ("nodemap", libxl_nodemap), ], dir=DIR_OUT) diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c --- a/tools/libxl/libxl_utils.c +++ b/tools/libxl/libxl_utils.c @@ -849,6 +849,70 @@ int libxl_numa_candidate_add_cpus(libxl_ return rc; } +int libxl_numa_candidate_count_domains(libxl_ctx *ctx, + 
libxl_numa_candidate *candidate) +{ + libxl_nodemap dom_nodemap; + libxl_cputopology *tinfo; + int nr_doms, nr_cpus, rc = 0; + libxl_dominfo *dinfo; + int i, j, k; + + dinfo = libxl_list_domain(ctx, &nr_doms); + if (dinfo == NULL) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_list_domain failed\n"); + rc = ERROR_NOMEM; + goto out; + } + + if (libxl_nodemap_alloc(ctx, &dom_nodemap) < 0) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_nodemap_alloc failed\n"); + rc = ERROR_NOMEM; + goto out_dominfo; + } + + tinfo = libxl_get_cpu_topology(ctx, &nr_cpus); + if (tinfo == NULL) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "libxl_get_topologyinfo failed\n"); + rc = ERROR_NOMEM; + goto out_nodemap; + } + + candidate->nr_domains = 0; + for (i = 0; i < nr_doms; i++) { + libxl_vcpuinfo *vinfo; + int nr_vcpus, nr_cpus; + + vinfo = libxl_list_vcpu(ctx, dinfo[i].domid, &nr_vcpus, &nr_cpus); + if (vinfo == NULL) + continue; + + libxl_nodemap_set_none(&dom_nodemap); + for (j = 0; j < nr_vcpus; j++) { + libxl_for_each_set_cpu(k, vinfo[j].cpumap) + libxl_nodemap_set(&dom_nodemap, tinfo[k].node); + } + + libxl_for_each_set_node(j, dom_nodemap) { + if (libxl_nodemap_test(&candidate->nodemap, j)) { + candidate->nr_domains++; + goto found; + } + } + found: + libxl_vcpuinfo_list_free(vinfo, nr_vcpus); + } + + + libxl_cputopology_list_free(tinfo, nr_cpus); + out_nodemap: + libxl_nodemap_dispose(&dom_nodemap); + out_dominfo: + libxl_dominfo_list_free(dinfo, nr_doms); + out: + return rc; +} + void libxl_numa_candidates_list_free(libxl_numa_candidate *list, int nr) { int i; diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -522,6 +522,34 @@ static int cpus_per_node(libxl_cputopolo return cpus_nodes; } +/* + * The NUMA placement candidates are reordered according to the following + * heuristics: + * - candidates involving fewer nodes come first. 
In case two (or + * more) candidates span the same number of nodes, + * - candidates with greater amount of free memory come first. In + * case two (or more) candidates differ in their amount of free + * memory by less than 10%, + * - candidates with fewer domains insisting on them at the time of + * this call come first. + */ +static int candidates_cmpf(const void *v1, const void *v2) +{ + const libxl_numa_candidate *c1 = (const libxl_numa_candidate*) v1; + const libxl_numa_candidate *c2 = (const libxl_numa_candidate*) v2; + double mem_diff = labs(c1->free_memkb - c2->free_memkb); + double mem_avg = (c1->free_memkb + c2->free_memkb) / 2.0; + + if (c1->nr_nodes != c2->nr_nodes) + return c1->nr_nodes - c2->nr_nodes; + + if ((mem_diff / mem_avg) * 100.0 < 10.0 && + c1->nr_domains != c2->nr_domains) + return c1->nr_domains - c2->nr_domains; + + return c2->free_memkb - c1->free_memkb; +} + /* Try to achieve "optimal" NUMA placement */ static int place_domain(libxl_domain_build_info *b_info) { @@ -575,6 +603,18 @@ static int place_domain(libxl_domain_bui goto out_topologyinfo; } + /* Account for the number of domains insisting on a candidate placement */ + for (i = 0; i < nr_candidates; i++) { + if (libxl_numa_candidate_count_domains(ctx, &candidates[i])) { + fprintf(stderr, "libxl_numa_candidate_count_domains failed\n"); + err = ENOMEM; + goto out_cndtslist; + } + } + + /* Reorder candidates (see @candidates_cmpf for the heuristics) */ + qsort(candidates, nr_candidates, sizeof(candidates[0]), candidates_cmpf); + /* Pick a candidate and ensure it gives us enough PCPUs */ dom_max_nodes = -1; err = ERROR_FAIL; for (candidate = 0; err && candidate < nr_candidates; candidate++) { @@ -596,6 +636,7 @@ static int place_domain(libxl_domain_bui } } +out_cndtslist: libxl_numa_candidates_list_free(candidates, nr_candidates); out_topologyinfo: libxl_cputopology_list_free(tinfo, nr_cpus); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx 
http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |