[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC][PATCH 04/13] tools/libxl: detect and avoid conflicts with RDM



While building a VM, HVM domain builder provides struct hvm_info_table{}
to help hvmloader. Currently it includes two fields to construct guest
e820 table by hvmloader, low_mem_pgend and high_mem_pgend. So we should
check them to fix any conflict with RAM.

RMRR can reside in address space beyond 4G theoretically, but we never
see this in real world. So in order to avoid breaking highmem layout
we don't solve highmem conflict. Note this means highmem rmrr could still
be supported if no conflict.

But in the case of lowmem, RMRR probably scatter the whole RAM space.
Especially multiple RMRR entries would worsen this to lead a complicated
memory layout. And then its hard to extend hvm_info_table{} to work
hvmloader out. So here we're trying to figure out a simple solution to
avoid breaking existing layout. So when a conflict occurs,

    #1. Above a predefined boundary (default 2G)
        - move lowmem_end below reserved region to solve conflict;

    #2 Below a predefined boundary (default 2G)
        - Check force/try policy.
        "force" policy leads to fail libxl. Note when both policies
        are specified on a given region, 'force' is always preferred.
        "try" policy issue a warning message and also mask this entry INVALID
        to indicate we shouldn't expose this entry to hvmloader.

Signed-off-by: Tiejun Chen <tiejun.chen@xxxxxxxxx>
---
 tools/libxc/include/xenctrl.h  |   8 ++
 tools/libxc/include/xenguest.h |   3 +-
 tools/libxc/xc_domain.c        |  40 +++++++++
 tools/libxc/xc_hvm_build_x86.c |  28 +++---
 tools/libxl/libxl_create.c     |   2 +-
 tools/libxl/libxl_dm.c         | 195 +++++++++++++++++++++++++++++++++++++++++
 tools/libxl/libxl_dom.c        |  27 +++++-
 tools/libxl/libxl_internal.h   |  11 ++-
 tools/libxl/libxl_types.idl    |   7 ++
 9 files changed, 303 insertions(+), 18 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 59bbe06..299b95f 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2053,6 +2053,14 @@ int xc_get_device_group(xc_interface *xch,
                      uint32_t *num_sdevs,
                      uint32_t *sdev_array);
 
+struct xen_reserved_device_memory
+*xc_device_get_rdm(xc_interface *xch,
+                   uint32_t flag,
+                   uint16_t seg,
+                   uint8_t bus,
+                   uint8_t devfn,
+                   unsigned int *nr_entries);
+
 int xc_test_assign_device(xc_interface *xch,
                           uint32_t domid,
                           uint32_t machine_sbdf);
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 40bbac8..0f1c23b 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -218,7 +218,8 @@ struct xc_hvm_firmware_module {
 };
 
 struct xc_hvm_build_args {
-    uint64_t mem_size;           /* Memory size in bytes. */
+    uint64_t lowmem_size;        /* All low memory size in bytes. */
+    uint64_t mem_size;           /* All memory size in bytes. */
     uint64_t mem_target;         /* Memory target in bytes. */
     uint64_t mmio_size;          /* Size of the MMIO hole in bytes. */
     const char *image_file_name; /* File name of the image to load. */
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 4f8383e..85b18ea 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -1665,6 +1665,46 @@ int xc_assign_device(
     return do_domctl(xch, &domctl);
 }
 
+struct xen_reserved_device_memory
+*xc_device_get_rdm(xc_interface *xch,
+                   uint32_t flag,
+                   uint16_t seg,
+                   uint8_t bus,
+                   uint8_t devfn,
+                   unsigned int *nr_entries)
+{
+    struct xen_reserved_device_memory *xrdm = NULL;
+    int rc = xc_reserved_device_memory_map(xch, flag, seg, bus, devfn, xrdm,
+                                           nr_entries);
+
+    if ( rc < 0 )
+    {
+        if ( errno == ENOBUFS )
+        {
+            if ( (xrdm = malloc(*nr_entries *
+                                sizeof(xen_reserved_device_memory_t))) == NULL 
)
+            {
+                PERROR("Could not allocate memory.");
+                goto out;
+            }
+            rc = xc_reserved_device_memory_map(xch, flag, seg, bus, devfn, 
xrdm,
+                                               nr_entries);
+            if ( rc )
+            {
+                PERROR("Could not get reserved device memory maps.");
+                free(xrdm);
+                xrdm = NULL;
+            }
+        }
+        else {
+            PERROR("Could not get reserved device memory maps.");
+        }
+    }
+
+ out:
+    return xrdm;
+}
+
 int xc_get_device_group(
     xc_interface *xch,
     uint32_t domid,
diff --git a/tools/libxc/xc_hvm_build_x86.c b/tools/libxc/xc_hvm_build_x86.c
index c81a25b..3f87bb3 100644
--- a/tools/libxc/xc_hvm_build_x86.c
+++ b/tools/libxc/xc_hvm_build_x86.c
@@ -89,19 +89,16 @@ static int modules_init(struct xc_hvm_build_args *args,
 }
 
 static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
-                           uint64_t mmio_start, uint64_t mmio_size)
+                           uint64_t lowmem_end)
 {
     struct hvm_info_table *hvm_info = (struct hvm_info_table *)
         (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
-    uint64_t lowmem_end = mem_size, highmem_end = 0;
+    uint64_t highmem_end = 0;
     uint8_t sum;
     int i;
 
-    if ( lowmem_end > mmio_start )
-    {
-        highmem_end = (1ull<<32) + (lowmem_end - mmio_start);
-        lowmem_end = mmio_start;
-    }
+    if ( mem_size > lowmem_end )
+        highmem_end = (1ull<<32) + (mem_size - lowmem_end);
 
     memset(hvm_info_page, 0, PAGE_SIZE);
 
@@ -252,7 +249,7 @@ static int setup_guest(xc_interface *xch,
     void *hvm_info_page;
     uint32_t *ident_pt;
     struct elf_binary elf;
-    uint64_t v_start, v_end;
+    uint64_t v_start, v_end, v_lowend, lowmem_end;
     uint64_t m_start = 0, m_end = 0;
     int rc;
     xen_capabilities_info_t caps;
@@ -275,6 +272,7 @@ static int setup_guest(xc_interface *xch,
     elf_parse_binary(&elf);
     v_start = 0;
     v_end = args->mem_size;
+    v_lowend = lowmem_end = args->lowmem_size;
 
     if ( xc_version(xch, XENVER_capabilities, &caps) != 0 )
     {
@@ -302,8 +300,11 @@ static int setup_guest(xc_interface *xch,
 
     for ( i = 0; i < nr_pages; i++ )
         page_array[i] = i;
-    for ( i = mmio_start >> PAGE_SHIFT; i < nr_pages; i++ )
-        page_array[i] += mmio_size >> PAGE_SHIFT;
+    /*
+     * This condition 'lowmem_end <= mmio_start' is always true.
+     */
+    for ( i = lowmem_end >> PAGE_SHIFT; i < nr_pages; i++ )
+        page_array[i] += ((1ull << 32) - lowmem_end) >> PAGE_SHIFT;
 
     /*
      * Try to claim pages for early warning of insufficient memory available.
@@ -469,7 +470,7 @@ static int setup_guest(xc_interface *xch,
               xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
               HVM_INFO_PFN)) == NULL )
         goto error_out;
-    build_hvm_info(hvm_info_page, v_end, mmio_start, mmio_size);
+    build_hvm_info(hvm_info_page, v_end, v_lowend);
     munmap(hvm_info_page, PAGE_SIZE);
 
     /* Allocate and clear special pages. */
@@ -588,9 +589,6 @@ int xc_hvm_build(xc_interface *xch, uint32_t domid,
     if ( args.mem_target == 0 )
         args.mem_target = args.mem_size;
 
-    if ( args.mmio_size == 0 )
-        args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
-
     /* An HVM guest must be initialised with at least 2MB memory. */
     if ( args.mem_size < (2ull << 20) || args.mem_target < (2ull << 20) )
         return -1;
@@ -634,6 +632,8 @@ int xc_hvm_build_target_mem(xc_interface *xch,
     args.mem_size = (uint64_t)memsize << 20;
     args.mem_target = (uint64_t)target << 20;
     args.image_file_name = image_name;
+    if ( args.mmio_size == 0 )
+        args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
 
     return xc_hvm_build(xch, domid, &args);
 }
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 9ed40d4..eab86fd 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -430,7 +430,7 @@ int libxl__domain_build(libxl__gc *gc,
 
     switch (info->type) {
     case LIBXL_DOMAIN_TYPE_HVM:
-        ret = libxl__build_hvm(gc, domid, info, state);
+        ret = libxl__build_hvm(gc, domid, d_config, state);
         if (ret)
             goto out;
 
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index a8b08f2..9ad04ae 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -90,6 +90,201 @@ const char *libxl__domain_device_model(libxl__gc *gc,
     return dm;
 }
 
+/*
+ * Check whether there exists rdm hole in the specified memory range.
+ * Returns 1 if exists, else returns 0.
+ */
+static int check_rdm_hole(uint64_t start, uint64_t memsize,
+                          uint64_t rdm_start, uint64_t rdm_size)
+{
+    if ( start + memsize <= rdm_start || start >= rdm_start + rdm_size )
+        return 0;
+    else
+        return 1;
+}
+
+/*
+ * Check reported RDM regions and handle potential gfn conflicts according
+ * to user preferred policy.
+ */
+int libxl__domain_device_check_rdm(libxl__gc *gc,
+                                   libxl_domain_config *d_config,
+                                   uint64_t rdm_mem_guard,
+                                   struct xc_hvm_build_args *args)
+{
+    int i, j, conflict;
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    struct xen_reserved_device_memory *xrdm = NULL;
+    unsigned int nr_all_rdms = 0;
+    uint64_t rdm_start, rdm_size, highmem_end = (1ULL << 32);
+    uint32_t type = d_config->b_info.rdm.type;
+    uint16_t seg;
+    uint8_t bus, devfn;
+
+    /* Might not to expose rdm. */
+    if ((type == LIBXL_RDM_RESERVE_TYPE_NONE) && !d_config->num_pcidevs)
+        return 0;
+
+    /* Collect all rdm info if exist. */
+    xrdm = xc_device_get_rdm(ctx->xch, LIBXL_RDM_RESERVE_TYPE_HOST,
+                             0, 0, 0, &nr_all_rdms);
+    if (!nr_all_rdms)
+        return 0;
+    d_config->rdms = libxl__calloc(gc, nr_all_rdms,
+                                   sizeof(libxl_device_rdm));
+    memset(d_config->rdms, 0, sizeof(libxl_device_rdm));
+
+    /* Query all RDM entries in this platform */
+    if (type == LIBXL_RDM_RESERVE_TYPE_HOST) {
+        d_config->num_rdms = nr_all_rdms;
+        for (i = 0; i < d_config->num_rdms; i++) {
+            d_config->rdms[i].start =
+                                (uint64_t)xrdm[i].start_pfn << XC_PAGE_SHIFT;
+            d_config->rdms[i].size =
+                                (uint64_t)xrdm[i].nr_pages << XC_PAGE_SHIFT;
+            d_config->rdms[i].flag = d_config->b_info.rdm.reserve;
+        }
+    } else {
+        d_config->num_rdms = 0;
+    }
+    free(xrdm);
+
+    /* Query RDM entries per-device */
+    for (i = 0; i < d_config->num_pcidevs; i++) {
+        unsigned int nr_entries = 0;
+        bool new = true;
+        seg = d_config->pcidevs[i].domain;
+        bus = d_config->pcidevs[i].bus;
+        devfn = PCI_DEVFN(d_config->pcidevs[i].dev, d_config->pcidevs[i].func);
+        nr_entries = 0;
+        xrdm = xc_device_get_rdm(ctx->xch, LIBXL_RDM_RESERVE_TYPE_NONE,
+                                 seg, bus, devfn, &nr_entries);
+        /* No RDM to associated with this device. */
+        if (!nr_entries)
+            continue;
+
+        /* Need to check whether this entry is already saved in the array.
+         * This could come from two cases:
+         *
+         *   - user may configure to get all RMRRs in this platform, which
+         * is already queried before this point
+         *   - or two assigned devices may share one RMRR entry
+         *
+         * different policies may be configured on the same RMRR due to above
+         * two cases. We choose a simple policy to always favor stricter policy
+         */
+        for (j = 0; j < d_config->num_rdms; j++) {
+            if (d_config->rdms[j].start ==
+                                (uint64_t)xrdm[0].start_pfn << XC_PAGE_SHIFT)
+             {
+                if (d_config->rdms[j].flag != LIBXL_RDM_RESERVE_FLAG_FORCE)
+                    d_config->rdms[j].flag = d_config->pcidevs[i].rdm_reserve;
+                new = false;
+                break;
+            }
+        }
+
+        if (new) {
+            if (d_config->num_rdms > nr_all_rdms - 1) {
+                LIBXL__LOG(CTX, LIBXL__LOG_ERROR, "Exceed rdm array 
boundary!\n");
+                free(xrdm);
+                return -1;
+            }
+
+            /*
+             * This is a new entry.
+             */
+            d_config->rdms[d_config->num_rdms].start =
+                                (uint64_t)xrdm[0].start_pfn << XC_PAGE_SHIFT;
+            d_config->rdms[d_config->num_rdms].size =
+                                (uint64_t)xrdm[0].nr_pages << XC_PAGE_SHIFT;
+            d_config->rdms[d_config->num_rdms].flag = 
d_config->pcidevs[i].rdm_reserve;
+            d_config->num_rdms++;
+        }
+        free(xrdm);
+    }
+
+    /* Fix highmem. */
+    if (args->mem_size > args->lowmem_size)
+        highmem_end += (args->mem_size - args->lowmem_size);
+    /* Next step is to check and avoid potential conflict between RDM entries
+     * and guest RAM. To avoid intrusive impact to existing memory layout
+     * {lowmem, mmio, highmem} which is passed around various function blocks,
+     * below conflicts are not handled which are rare and handling them would
+     * lead to a more scattered layout:
+     *  - RMRR in highmem area (>4G)
+     *  - RMRR lower than a defined memory boundary (e.g. 2G)
+     * Otherwise for conflicts between boundary and 4G, we'll simply move 
lowmem
+     * end below reserved region to solve conflict.
+     *
+     * If a conflict is detected on a given RMRR entry, an error will be
+     * returned.
+     * If 'force' policy is specified. Or conflict is treated as a warning if
+     * 'try' policy is specified, and we also mark this as INVALID not to 
expose
+     * this entry to hvmloader.
+     *
+     * Firstly we should check the case of rdm < 4G because we may need to
+     * expand highmem_end.
+     */
+    for (i = 0; i < d_config->num_rdms; i++) {
+        rdm_start = d_config->rdms[i].start;
+        rdm_size = d_config->rdms[i].size;
+        conflict = check_rdm_hole(0, args->lowmem_size, rdm_start, rdm_size);
+
+        if (!conflict)
+            continue;
+
+        /*
+         * Just check if RDM > our memory boundary
+         */
+        if (d_config->rdms[i].start > rdm_mem_guard) {
+            /*
+             * We will move downwards lowmem_end so we have to expand
+             * highmem_end.
+             */
+            highmem_end += (args->lowmem_size - rdm_start);
+            /* Now move downwards lowmem_end. */
+            args->lowmem_size = rdm_start;
+        }
+    }
+
+    /*
+     * Finally we can take same policy to check lowmem(< 2G) and
+     * highmem adjusted above.
+     */
+    for (i = 0; i < d_config->num_rdms; i++) {
+        rdm_start = d_config->rdms[i].start;
+        rdm_size = d_config->rdms[i].size;
+        /* Does this entry conflict with lowmem? */
+        conflict = check_rdm_hole(0, args->lowmem_size,
+                                  rdm_start, rdm_size);
+        /* Does this entry conflict with highmem? */
+        conflict |= check_rdm_hole((1ULL<<32), highmem_end,
+                                   rdm_start, rdm_size);
+
+        if (!conflict)
+            continue;
+
+        if(d_config->rdms[i].flag == LIBXL_RDM_RESERVE_FLAG_FORCE) {
+            LIBXL__LOG(CTX, LIBXL__LOG_ERROR, "RDM conflict at 0x%lx.\n",
+                                                d_config->rdms[i].start);
+            return -1;
+        } else {
+            LIBXL__LOG(CTX, LIBXL__LOG_WARNING,
+                                        "Ignoring RDM conflict at 0x%lx.\n",
+                                        d_config->rdms[i].start);
+
+            /*
+             * Then mask this INVALID to indicate we shouldn't expose this
+             * to hvmloader.
+             */
+            d_config->rdms[i].flag = LIBXL_RDM_RESERVE_FLAG_INVALID;
+        }
+    }
+
+    return 0;
+}
+
 const libxl_vnc_info *libxl__dm_vnc(const libxl_domain_config *guest_config)
 {
     const libxl_vnc_info *vnc = NULL;
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index a16d4a1..ee67282 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -788,12 +788,14 @@ out:
 }
 
 int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
-              libxl_domain_build_info *info,
+              libxl_domain_config *d_config,
               libxl__domain_build_state *state)
 {
     libxl_ctx *ctx = libxl__gc_owner(gc);
     struct xc_hvm_build_args args = {};
     int ret, rc = ERROR_FAIL;
+    libxl_domain_build_info *const info = &d_config->b_info;
+    uint64_t rdm_mem_boundary, mmio_start;
 
     memset(&args, 0, sizeof(struct xc_hvm_build_args));
     /* The params from the configuration file are in Mb, which are then
@@ -802,6 +804,8 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
      * Do all this in one step here...
      */
     args.mem_size = (uint64_t)(info->max_memkb - info->video_memkb) << 10;
+    args.lowmem_size = (args.mem_size > (1ULL << 32)) ?
+                            (1ULL << 32) : args.mem_size;
     args.mem_target = (uint64_t)(info->target_memkb - info->video_memkb) << 10;
     args.claim_enabled = libxl_defbool_val(info->claim_mode);
     if (info->u.hvm.mmio_hole_memkb) {
@@ -811,6 +815,27 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
         if (max_ram_below_4g < HVM_BELOW_4G_MMIO_START)
             args.mmio_size = info->u.hvm.mmio_hole_memkb << 10;
     }
+
+    if (args.mmio_size == 0)
+        args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
+    mmio_start = (1ull << 32) - args.mmio_size;
+
+    if (args.lowmem_size > mmio_start)
+        args.lowmem_size = mmio_start;
+
+    /*
+     * We'd like to set a memory boundary to determine if we need to check
+     * any overlap with reserved device memory.
+     *
+     * TODO: we will add a config parameter for this boundary value next.
+     */
+    rdm_mem_boundary = 0x80000000;
+    ret = libxl__domain_device_check_rdm(gc, d_config, rdm_mem_boundary, 
&args);
+    if (ret) {
+        LOG(ERROR, "checking reserved device memory failed");
+        goto out;
+    }
+
     if (libxl__domain_firmware(gc, info, &args)) {
         LOG(ERROR, "initializing domain firmware failed");
         goto out;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 934465a..64d05b3 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -985,7 +985,7 @@ _hidden int libxl__build_post(libxl__gc *gc, uint32_t domid,
 _hidden int libxl__build_pv(libxl__gc *gc, uint32_t domid,
              libxl_domain_build_info *info, libxl__domain_build_state *state);
 _hidden int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
-              libxl_domain_build_info *info,
+              libxl_domain_config *d_config,
               libxl__domain_build_state *state);
 
 _hidden int libxl__qemu_traditional_cmd(libxl__gc *gc, uint32_t domid,
@@ -1480,6 +1480,15 @@ _hidden int libxl__need_xenpv_qemu(libxl__gc *gc,
         int nr_channels, libxl_device_channel *channels);
 
 /*
+ * This function will fix reserved device memory conflict
+ * according to user's configuration.
+ */
+_hidden int libxl__domain_device_check_rdm(libxl__gc *gc,
+                                   libxl_domain_config *d_config,
+                                   uint64_t rdm_mem_guard,
+                                   struct xc_hvm_build_args *args);
+
+/*
  * This function will cause the whole libxl process to hang
  * if the device model does not respond.  It is deprecated.
  *
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 5786455..218931b 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -541,6 +541,12 @@ libxl_device_pci = Struct("device_pci", [
     ("rdm_reserve",   libxl_rdm_reserve_flag),
     ])
 
+libxl_device_rdm = Struct("device_rdm", [
+    ("start", uint64),
+    ("size", uint64),
+    ("flag", bool),
+    ])
+
 libxl_device_vtpm = Struct("device_vtpm", [
     ("backend_domid",    libxl_domid),
     ("backend_domname",  string),
@@ -567,6 +573,7 @@ libxl_domain_config = Struct("domain_config", [
     ("disks", Array(libxl_device_disk, "num_disks")),
     ("nics", Array(libxl_device_nic, "num_nics")),
     ("pcidevs", Array(libxl_device_pci, "num_pcidevs")),
+    ("rdms", Array(libxl_device_rdm, "num_rdms")),
     ("vfbs", Array(libxl_device_vfb, "num_vfbs")),
     ("vkbs", Array(libxl_device_vkb, "num_vkbs")),
     ("vtpms", Array(libxl_device_vtpm, "num_vtpms")),
-- 
1.9.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.