To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Remus: Make checkpoint buffering HVM-aware
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 09 Nov 2009 12:00:45 -0800
Delivery-date: Mon, 09 Nov 2009 12:01:42 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1257794367 0
# Node ID f6091947ffed5707b60120fe44d3d038a3307591
# Parent  01c9cb566f61f143870600b30857562963aaee27
Remus: Make checkpoint buffering HVM-aware

Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>
---
 tools/libxc/xc_domain_restore.c         |  387 +++++++++++++++++++++++---------
 tools/python/xen/xend/XendCheckpoint.py |   20 -
 2 files changed, 289 insertions(+), 118 deletions(-)
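
[Editorial note, not part of the changeset: the restore path now buffers the image tail for both PV and HVM guests through the new tailbuf_t union added below, instead of handling HVM state inline. A minimal sketch of the intended call sequence, using the buffer_tail()/tailbuf_free() interface from this patch; the surrounding variables (io_fd, hvm, max_vcpu_id, vcpumap, ext_vcpucontext) are those of xc_domain_restore() and the snippet is illustrative only.]

    /* Illustrative sketch only -- not part of the changeset. */
    tailbuf_t tailbuf;

    memset(&tailbuf, 0, sizeof(tailbuf));
    tailbuf.ishvm = hvm;                  /* selects the HVM or PV variant */

    if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
                     ext_vcpucontext) < 0 )
        goto out;                         /* image tail could not be buffered */

    /* ... apply the buffered tail: PV vcpu/pfn/shared-info state, or HVM
     * magic PFNs, params and the QEMU device-model record ... */

    tailbuf_free(&tailbuf);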

diff -r 01c9cb566f61 -r f6091947ffed tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c   Mon Nov 09 19:19:27 2009 +0000
@@ -670,15 +670,204 @@ static xen_pfn_t *load_p2m_frame_list(
 }
 
 typedef struct {
-    unsigned int pfncount;
-    unsigned long* pfntab;
-    unsigned int vcpucount;
-    unsigned char* vcpubuf;
-    unsigned char shared_info_page[PAGE_SIZE];
+    int ishvm;
+    union {
+        struct tailbuf_pv {
+            unsigned int pfncount;
+            unsigned long* pfntab;
+            unsigned int vcpucount;
+            unsigned char* vcpubuf;
+            unsigned char shared_info_page[PAGE_SIZE];
+        } pv;
+        struct tailbuf_hvm {
+            uint64_t magicpfns[3];
+            uint32_t hvmbufsize, reclen;
+            uint8_t* hvmbuf;
+            struct {
+                uint32_t magic;
+                uint32_t version;
+                uint64_t len;
+            } qemuhdr;
+            uint32_t qemubufsize;
+            uint8_t* qemubuf;
+        } hvm;
+    } u;
 } tailbuf_t;
 
-static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
-                      uint64_t vcpumap, int ext_vcpucontext)
+/* read stream until EOF, growing buffer as necessary */
+static int compat_buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+    uint8_t *qbuf, *tmp;
+    int blen = 0, dlen = 0;
+    int rc;
+
+    /* currently save records tend to be about 7K */
+    blen = 8192;
+    if ( !(qbuf = malloc(blen)) ) {
+        ERROR("Error allocating QEMU buffer");
+        return -1;
+    }
+
+    while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
+        DPRINTF("Read %d bytes of QEMU data\n", rc);
+        dlen += rc;
+
+        if (dlen == blen) {
+            DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
+            blen += 4096;
+            tmp = realloc(qbuf, blen);
+            if ( !tmp ) {
+                ERROR("Error growing QEMU buffer to %d bytes", blen);
+                free(qbuf);
+                return -1;
+            }
+            qbuf = tmp;
+        }
+    }
+
+    if ( rc < 0 ) {
+        ERROR("Error reading QEMU data");
+        free(qbuf);
+        return -1;
+    }
+
+    if ( memcmp(qbuf, "QEVM", 4) ) {
+        ERROR("Invalid QEMU magic: 0x%08x", *(unsigned long*)qbuf);
+        free(qbuf);
+        return -1;
+    }
+
+    buf->qemubuf = qbuf;
+    buf->qemubufsize = dlen;
+
+    return 0;
+}
+
+static int buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+    uint32_t qlen;
+    uint8_t *tmp;
+
+    if ( read_exact(fd, &qlen, sizeof(qlen)) ) {
+        ERROR("Error reading QEMU header length");
+        return -1;
+    }
+
+    if ( qlen > buf->qemubufsize ) {
+        if ( buf->qemubuf) {
+            tmp = realloc(buf->qemubuf, qlen);
+            if ( tmp )
+                buf->qemubuf = tmp;
+            else {
+                ERROR("Error reallocating QEMU state buffer");
+                return -1;
+            }
+        } else {
+            buf->qemubuf = malloc(qlen);
+            if ( !buf->qemubuf ) {
+                ERROR("Error allocating QEMU state buffer");
+                return -1;
+            }
+        }
+    }
+    buf->qemubufsize = qlen;
+
+    if ( read_exact(fd, buf->qemubuf, buf->qemubufsize) ) {
+        ERROR("Error reading QEMU state");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int dump_qemu(uint32_t dom, struct tailbuf_hvm *buf)
+{
+    int saved_errno;
+    char path[256];
+    FILE *fp;
+
+    sprintf(path, "/var/lib/xen/qemu-save.%u", dom);
+    fp = fopen(path, "wb");
+    if ( !fp )
+        return -1;
+
+    DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
+    if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
+        saved_errno = errno;
+        fclose(fp);
+        errno = saved_errno;
+        return -1;
+    }
+
+    fclose(fp);
+
+    return 0;
+}
+
+static int buffer_tail_hvm(struct tailbuf_hvm *buf, int fd,
+                           unsigned int max_vcpu_id, uint64_t vcpumap,
+                           int ext_vcpucontext)
+{
+    uint8_t *tmp;
+    unsigned char qemusig[21];
+
+    if ( read_exact(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
+        ERROR("Error reading magic PFNs");
+        return -1;
+    }
+
+    if ( read_exact(fd, &buf->reclen, sizeof(buf->reclen)) ) {
+        ERROR("Error reading HVM params size");
+        return -1;
+    }
+
+    if ( buf->reclen > buf->hvmbufsize ) {
+        if ( buf->hvmbuf) {
+            tmp = realloc(buf->hvmbuf, buf->reclen);
+            if ( tmp ) {
+                buf->hvmbuf = tmp;
+                buf->hvmbufsize = buf->reclen;
+            } else {
+                ERROR("Error reallocating HVM param buffer");
+                return -1;
+            }
+        } else {
+            buf->hvmbuf = malloc(buf->reclen);
+            if ( !buf->hvmbuf ) {
+                ERROR("Error allocating HVM param buffer");
+                return -1;
+            }
+            buf->hvmbufsize = buf->reclen;
+        }
+    }
+
+    if ( read_exact(fd, buf->hvmbuf, buf->reclen) ) {
+        ERROR("Error reading HVM params");
+        return -1;
+    }
+
+    if ( read_exact(fd, qemusig, sizeof(qemusig)) ) {
+        ERROR("Error reading QEMU signature");
+        return -1;
+    }
+
+    /* The normal live-migration QEMU record has no length information.
+     * Short of reimplementing the QEMU parser, we're forced to just read
+     * until EOF. Remus gets around this by sending a different signature
+     * which includes a length prefix */
+    if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
+        return compat_buffer_qemu(fd, buf);
+    else if ( !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
+        return buffer_qemu(fd, buf);
+
+    qemusig[20] = '\0';
+    ERROR("Invalid QEMU signature: %s", qemusig);
+    return -1;
+}
+
+static int buffer_tail_pv(struct tailbuf_pv *buf, int fd,
+                          unsigned int max_vcpu_id, uint64_t vcpumap,
+                          int ext_vcpucontext)
 {
     unsigned int i;
     size_t pfnlen, vcpulen;
@@ -753,16 +942,47 @@ static int buffer_tail(tailbuf_t* buf, i
     return -1;
 }
 
-static void tailbuf_free(tailbuf_t* buf)
-{
-    if (buf->vcpubuf) {
+static int buffer_tail(tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
+                       uint64_t vcpumap, int ext_vcpucontext)
+{
+    if ( buf->ishvm )
+        return buffer_tail_hvm(&buf->u.hvm, fd, max_vcpu_id, vcpumap,
+                               ext_vcpucontext);
+    else
+        return buffer_tail_pv(&buf->u.pv, fd, max_vcpu_id, vcpumap,
+                              ext_vcpucontext);
+}
+
+static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
+{
+    if ( buf->hvmbuf ) {
+        free(buf->hvmbuf);
+        buf->hvmbuf = NULL;
+    }
+    if ( buf->qemubuf ) {
+        free(buf->qemubuf);
+        buf->qemubuf = NULL;
+    }
+}
+
+static void tailbuf_free_pv(struct tailbuf_pv *buf)
+{
+    if ( buf->vcpubuf ) {
         free(buf->vcpubuf);
         buf->vcpubuf = NULL;
     }
-    if (buf->pfntab) {
+    if ( buf->pfntab ) {
         free(buf->pfntab);
         buf->pfntab = NULL;
     }
+}
+
+static void tailbuf_free(tailbuf_t *buf)
+{
+    if ( buf->ishvm )
+        tailbuf_free_hvm(&buf->u.hvm);
+    else
+        tailbuf_free_pv(&buf->u.pv);
 }
 
 typedef struct {
@@ -1118,18 +1338,13 @@ int xc_domain_restore(int xc_handle, int
     unsigned int max_vcpu_id = 0;
     int new_ctxt_format = 0;
 
-    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
-    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
-    /* Buffer for holding HVM context */
-    uint8_t *hvm_buf = NULL;
-
     pagebuf_t pagebuf;
     tailbuf_t tailbuf, tmptail;
     void* vcpup;
 
     pagebuf_init(&pagebuf);
     memset(&tailbuf, 0, sizeof(tailbuf));
+    tailbuf.ishvm = hvm;
 
     /* For info only */
     nr_pfns = 0;
@@ -1313,78 +1528,6 @@ int xc_domain_restore(int xc_handle, int
 
     // DPRINTF("Received all pages (%d races)\n", nraces);
 
-    if ( hvm ) 
-    {
-        uint32_t rec_len;
-
-        /* Set HVM-specific parameters */
-        if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
-        {
-            ERROR("error reading magic page addresses");
-            goto out;
-        }
-        
-        /* These comms pages need to be zeroed at the start of day */
-        if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
-             xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
-             xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
-        {
-            ERROR("error zeroing magic pages");
-            goto out;
-        }
-                
-        if ( (frc = xc_set_hvm_param(xc_handle, dom, 
-                                     HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
-             || (frc = xc_set_hvm_param(xc_handle, dom, 
-                                        HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
-             || (frc = xc_set_hvm_param(xc_handle, dom, 
-                                        HVM_PARAM_STORE_PFN, magic_pfns[2]))
-             || (frc = xc_set_hvm_param(xc_handle, dom, 
-                                        HVM_PARAM_PAE_ENABLED, pae))
-             || (frc = xc_set_hvm_param(xc_handle, dom, 
-                                        HVM_PARAM_STORE_EVTCHN,
-                                        store_evtchn)) )
-        {
-            ERROR("error setting HVM params: %i", frc);
-            goto out;
-        }
-        *store_mfn = magic_pfns[2];
-
-        /* Read HVM context */
-        if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
-        {
-            ERROR("error read hvm context size!\n");
-            goto out;
-        }
-        
-        hvm_buf = malloc(rec_len);
-        if ( hvm_buf == NULL )
-        {
-            ERROR("memory alloc for hvm context buffer failed");
-            errno = ENOMEM;
-            goto out;
-        }
-        
-        if ( read_exact(io_fd, hvm_buf, rec_len) )
-        {
-            ERROR("error loading the HVM context");
-            goto out;
-        }
-        
-        frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
-        if ( frc )
-        {
-            ERROR("error setting the HVM context");
-            goto out;
-        }
-
-        /* HVM success! */
-        rc = 0;
-        goto out;
-    }
-
-    /* Non-HVM guests only from here on */
-
     if ( !completed ) {
         int flags = 0;
 
@@ -1407,6 +1550,7 @@ int xc_domain_restore(int xc_handle, int
         goto finish;
     }
     memset(&tmptail, 0, sizeof(tmptail));
+    tmptail.ishvm = hvm;
     if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
                      ext_vcpucontext) < 0 ) {
         ERROR ("error buffering image tail, finishing");
@@ -1418,6 +1562,8 @@ int xc_domain_restore(int xc_handle, int
     goto loadpages;
 
   finish:
+    if ( hvm )
+        goto finish_hvm;
 
     if ( (pt_levels == 3) && !pae_extended_cr3 )
     {
@@ -1589,15 +1735,15 @@ int xc_domain_restore(int xc_handle, int
     {
         int nr_frees = 0;
 
-        for ( i = 0; i < tailbuf.pfncount; i++ )
-        {
-            unsigned long pfn = tailbuf.pfntab[i];
+        for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
+        {
+            unsigned long pfn = tailbuf.u.pv.pfntab[i];
 
             if ( p2m[pfn] != INVALID_P2M_ENTRY )
             {
                 /* pfn is not in physmap now, but was at some point during
                    the save/migration process - need to free it */
-                tailbuf.pfntab[nr_frees++] = p2m[pfn];
+                tailbuf.u.pv.pfntab[nr_frees++] = p2m[pfn];
                 p2m[pfn]  = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
             }
         }
@@ -1609,7 +1755,7 @@ int xc_domain_restore(int xc_handle, int
                 .extent_order = 0,
                 .domid        = dom
             };
-            set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
+            set_xen_guest_handle(reservation.extent_start, tailbuf.u.pv.pfntab);
 
             if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                      &reservation)) != nr_frees )
@@ -1618,7 +1764,7 @@ int xc_domain_restore(int xc_handle, int
                 goto out;
             }
             else
-                DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
+                DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
         }
     }
 
@@ -1628,7 +1774,7 @@ int xc_domain_restore(int xc_handle, int
         return 1;
     }
 
-    vcpup = tailbuf.vcpubuf;
+    vcpup = tailbuf.u.pv.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
         if ( !(vcpumap & (1ULL << i)) )
@@ -1755,7 +1901,7 @@ int xc_domain_restore(int xc_handle, int
         }
     }
 
-    memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+    memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
 
     DPRINTF("Completed checkpoint load\n");
 
@@ -1812,6 +1958,51 @@ int xc_domain_restore(int xc_handle, int
 
     DPRINTF("Domain ready to be built.\n");
     rc = 0;
+    goto out;
+
+  finish_hvm:
+    /* Dump the QEMU state to a state file for QEMU to load */
+    if ( dump_qemu(dom, &tailbuf.u.hvm) ) {
+        ERROR("Error dumping QEMU state to file");
+        goto out;
+    }
+
+    /* These comms pages need to be zeroed at the start of day */
+    if ( xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[0]) ||
+         xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[1]) ||
+         xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[2]) )
+    {
+        ERROR("error zeroing magic pages");
+        goto out;
+    }
+
+    if ( (frc = xc_set_hvm_param(xc_handle, dom,
+                                 HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_PAE_ENABLED, pae))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_STORE_EVTCHN,
+                                    store_evtchn)) )
+    {
+        ERROR("error setting HVM params: %i", frc);
+        goto out;
+    }
+    *store_mfn = tailbuf.u.hvm.magicpfns[2];
+
+    frc = xc_domain_hvm_setcontext(xc_handle, dom, tailbuf.u.hvm.hvmbuf,
+                                   tailbuf.u.hvm.reclen);
+    if ( frc )
+    {
+        ERROR("error setting the HVM context");
+        goto out;
+    }
+
+    /* HVM success! */
+    rc = 0;
 
  out:
     if ( (rc != 0) && (dom != 0) )
@@ -1819,7 +2010,7 @@ int xc_domain_restore(int xc_handle, int
     free(mmu);
     free(p2m);
     free(pfn_type);
-    free(hvm_buf);
+    tailbuf_free(&tailbuf);
 
     /* discard cache for save file  */
     discard_file_cache(io_fd, 1 /*flush*/);
diff -r 01c9cb566f61 -r f6091947ffed tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py   Mon Nov 09 19:19:27 2009 +0000
@@ -323,26 +323,6 @@ def restore(xd, fd, dominfo = None, paus
         if not is_hvm and handler.console_mfn is None:
             raise XendError('Could not read console MFN')        
 
-        # get qemu state and create a tmp file for dm restore
-        # Even PV guests may have QEMU stat, but its not currently
-        # used so only bother with HVM currently.
-        if is_hvm:
-            qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
-                                        "invalid device model signature read")
-            if qemu_signature != QEMU_SIGNATURE:
-                raise XendError("not a valid device model state: found '%s'" %
-                                qemu_signature)
-            qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
-                              os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
-            while True:
-                buf = os.read(fd, dm_batch)
-                if len(buf):
-                    write_exact(qemu_fd, buf,
-                                "could not write dm state to tmp file")
-                else:
-                    break
-            os.close(qemu_fd)
-
         restore_image.setCpuid()
 
         # xc_restore will wait for source to close connection

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
