
[Xen-devel] [PATCH RFC v2 15/23] libxc/migration: implement the receiver side of postcopy live migration



From: Joshua Otto <jtotto@xxxxxxxxxxxx>

Add the receive-side logic for a new 'postcopy' phase in the live
migration algorithm.

To support this migration phase:
- Augment the main restore record-processing logic to recognize and
  handle the postcopy-initiation records.
- Add the core logic for the phase, postcopy_restore(), which marks as
  paged-out all pfns the sender reports as outstanding at the beginning
  of the phase.  It then serves as a pager for this subset of memory,
  forwarding paging requests to the migration sender and filling in the
  outstanding domain memory as it is received.

The new restore callbacks required for this migration phase are stubbed
in libxl for now, to be replaced in a subsequent patch that adds libxl
support for this migration phase.

Signed-off-by: Joshua Otto <jtotto@xxxxxxxxxxxx>
---
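A note for reviewers: the core of the receive side is the per-pfn state
machine documented in xc_sr_common.h below.  The following standalone
sketch (illustrative names only - the patch itself tracks the same states
in bitmaps hanging off ctx->restore.paging) shows the intended transitions:

    #include <assert.h>
    #include <limits.h>
    #include <stdbool.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    static void set_bit(unsigned long *map, unsigned long bit)
    {
        map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static void clear_bit(unsigned long *map, unsigned long bit)
    {
        map[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
    }

    static bool test_bit(const unsigned long *map, unsigned long bit)
    {
        return map[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG));
    }

    /* One bitmap per state; a pfn with no bit set anywhere is 'invalid'. */
    struct postcopy_state
    {
        unsigned long *outstanding, *requested, *ready, *dropped;
    };

    /* invalid -> outstanding: named by the sender in a POSTCOPY_PFNS record. */
    static void mark_outstanding(struct postcopy_state *s, unsigned long pfn)
    {
        set_bit(s->outstanding, pfn);
    }

    /* outstanding -> requested: the guest faulted before the data arrived. */
    static void mark_requested(struct postcopy_state *s, unsigned long pfn)
    {
        assert(test_bit(s->outstanding, pfn));
        clear_bit(s->outstanding, pfn);
        set_bit(s->requested, pfn);
    }

    /* outstanding/requested -> ready: page contents received and loaded. */
    static void mark_ready(struct postcopy_state *s, unsigned long pfn)
    {
        assert(test_bit(s->outstanding, pfn) || test_bit(s->requested, pfn));
        clear_bit(s->outstanding, pfn);
        clear_bit(s->requested, pfn);
        set_bit(s->ready, pfn);
    }

    /* outstanding -> dropped: the guest's pager told us to drop the page. */
    static void mark_dropped(struct postcopy_state *s, unsigned long pfn)
    {
        assert(test_bit(s->outstanding, pfn));
        clear_bit(s->outstanding, pfn);
        set_bit(s->dropped, pfn);
    }

    int main(void)
    {
        unsigned long o = 0, r = 0, y = 0, d = 0;
        struct postcopy_state s = { &o, &r, &y, &d };

        mark_outstanding(&s, 5);  /* POSTCOPY_PFNS names pfn 5 */
        mark_requested(&s, 5);    /* guest faults on pfn 5 */
        mark_ready(&s, 5);        /* POSTCOPY_PAGE_DATA delivers pfn 5 */
        mark_outstanding(&s, 6);
        mark_dropped(&s, 6);      /* pager drops pfn 6 */
        return !test_bit(s.ready, 5);
    }

The patch additionally maintains nr_pending_pfns - incremented when a pfn
becomes outstanding, decremented when it becomes ready or dropped - which
is the count postcopy_restore() loops on.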
 tools/libxc/include/xenguest.h      |  65 ++-
 tools/libxc/xc_sr_common.h          |  90 +++-
 tools/libxc/xc_sr_restore.c         | 957 ++++++++++++++++++++++++++++++++++--
 tools/libxc/xc_sr_restore_x86_hvm.c |  41 +-
 tools/libxl/libxl_create.c          |  17 +
 tools/libxl/libxl_save_msgs_gen.pl  |   2 +-
 6 files changed, 1113 insertions(+), 59 deletions(-)

diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index a662273..0049723 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -146,35 +146,52 @@ struct restore_callbacks {
      */
     int (*suspend)(void* data);
 
-    /* Called after the secondary vm is ready to resume.
-     * Callback function resumes the guest & the device model,
-     * returns to xc_domain_restore.
-     */
-    int (*aftercopy)(void* data);
+    union {
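+        /*
+         * The two callback sets below are never used together - a stream is
+         * either a postcopy live migration or a checkpointed (Remus/COLO)
+         * stream - so they can safely share storage.
+         */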
+        struct {
+            /*
+             * Called upon receipt of the POSTCOPY_TRANSITION record in the
+             * stream to yield control of the stream to the higher layer so that
+             * the remaining data needed to resume the domain in the postcopy
+             * phase can be obtained.  Returns as soon as the higher layer is
+             * finished with the stream.
+             *
+             * Returns 1 on success, 0 on failure.
+             */
+            int (*postcopy_transition)(void *data);
+        };
 
-    /* A checkpoint record has been found in the stream.
-     * returns: */
+        struct {
+            /* Called after the secondary vm is ready to resume.
+             * Callback function resumes the guest & the device model,
+             * returns to xc_domain_restore.
+             */
+            int (*aftercopy)(void* data);
+
+            /* A checkpoint record has been found in the stream.
+             * returns: */
 #define XGR_CHECKPOINT_ERROR    0 /* Terminate processing */
#define XGR_CHECKPOINT_SUCCESS  1 /* Continue reading more data from the stream */
 #define XGR_CHECKPOINT_FAILOVER 2 /* Failover and resume VM */
-    int (*checkpoint)(void* data);
-
-    /*
-     * Called after the checkpoint callback.
-     *
-     * returns:
-     * 0: terminate checkpointing gracefully
-     * 1: take another checkpoint
-     */
-    int (*wait_checkpoint)(void* data);
+            int (*checkpoint)(void* data);
 
-    /*
-     * callback to send store gfn and console gfn to xl
-     * if we want to resume vm before xc_domain_save()
-     * exits.
-     */
-    void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
-                            void *data);
+            /*
+             * Called after the checkpoint callback.
+             *
+             * returns:
+             * 0: terminate checkpointing gracefully
+             * 1: take another checkpoint
+             */
+            int (*wait_checkpoint)(void* data);
+
+            /*
+             * callback to send store gfn and console gfn to xl
+             * if we want to resume vm before xc_domain_save()
+             * exits.
+             */
+            void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+                                    void *data);
+        };
+    };
 
     /* to be provided as the last argument to each callback function */
     void* data;
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index 244c536..d382642 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -3,6 +3,10 @@
 
 #include <stdbool.h>
 
+#include <xenevtchn.h>
+
+#include <xen/vm_event.h>
+
 #include "xg_private.h"
 #include "xg_save_restore.h"
 #include "xc_dom.h"
@@ -238,6 +242,90 @@ struct xc_sr_context
             uint32_t guest_type;
             uint32_t guest_page_size;
 
+            /* Is this a postcopy live migration? */
+            bool postcopy;
+
+            struct xc_sr_restore_paging
+            {
+                xenevtchn_handle *xce_handle;
+                int port;
+                vm_event_back_ring_t back_ring;
+                uint32_t evtchn_port;
+                void *ring_page;
+                void *buffer;
+
+                struct xc_sr_pending_postcopy_request
+                {
+                    xen_pfn_t pfn; /* == INVALID_PFN when not in use */
+
+                    /* As from vm_event_request_t */
+                    uint32_t flags;
+                    uint32_t vcpu_id;
+                } *pending_requests;
+
+                /*
+                 * The total count of outstanding and requested pfns.  The
+                 * postcopy phase is complete when this reaches 0.
+                 */
+                unsigned int nr_pending_pfns;
+
+                /*
+                 * Prior to the receipt of the first POSTCOPY_PFNS record, all
+                 * pfns are 'invalid', meaning that we don't (yet) believe that
+                 * they need to be migrated as part of the postcopy phase.
+                 *
+                 * Pfns received in POSTCOPY_PFNS records become 'outstanding',
+                 * meaning that they must be migrated but haven't yet been
+                 * requested, received or dropped.
+                 *
+                 * A pfn transitions from outstanding to requested when we
+                 * receive a request for it on the paging ring and request it
+                 * from the sender, before having received it.  There is at
+                 * least one valid entry in pending_requests for each requested
+                 * pfn.
+                 *
+                 * A pfn transitions from either outstanding or requested to
+                 * ready when its contents are received.  Responses to all
+                 * previous pager requests for this pfn are pushed at this time,
+                 * and subsequent pager requests for this pfn can be responded
+                 * to immediately.
+                 *
+                 * A pfn transitions from outstanding to dropped if we're
+                 * notified on the ring of the drop.  We track this explicitly
+                 * so that we don't panic upon subsequently receiving the
+                 * contents of this page from the sender.
+                 *
+                 * In summary, the per-pfn postcopy state machine is:
+                 *
+                 * invalid -> outstanding -> requested -> ready
+                 *                |                        ^
+                 *                +------------------------+
+                 *                |
+                 *                +-----------> dropped
+                 *
+                 * The state of each pfn is tracked using these four bitmaps.
+                 */
+                unsigned long *outstanding_pfns;
+                unsigned long *requested_pfns;
+                unsigned long *ready_pfns;
+                unsigned long *dropped_pfns;
+
+                /*
+                 * Used to accumulate batches of pfns for which we must forward
+                 * paging requests to the sender.
+                 */
+                uint64_t *request_batch;
+
+                /* For teardown. */
+                bool evtchn_bound, evtchn_opened, paging_enabled, buffer_locked;
+
+                /*
+                 * So we can sanity-check the sequence of postcopy records in
+                 * the stream.
+                 */
+                bool ready;
+            } paging;
+
             /* Plain VM, or checkpoints over time. */
             int checkpointed;
 
@@ -261,7 +349,7 @@ struct xc_sr_context
              * INPUT:  evtchn & domid
              * OUTPUT: gfn
              */
-            xen_pfn_t    xenstore_gfn,    console_gfn;
+            xen_pfn_t    xenstore_gfn,    console_gfn,    paging_ring_gfn;
             unsigned int xenstore_evtchn, console_evtchn;
             domid_t      xenstore_domid,  console_domid;
 
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index 51532aa..3aac0f0 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -1,6 +1,7 @@
 #include <arpa/inet.h>
 
 #include <assert.h>
+#include <poll.h>
 
 #include "xc_sr_common.h"
 
@@ -78,6 +79,30 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
     return test_bit(pfn, ctx->restore.populated_pfns);
 }
 
+static int pfn_bitmap_realloc(struct xc_sr_context *ctx, unsigned long **bitmap,
+                              size_t old_sz, size_t new_sz)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned long *p;
+
+    assert(bitmap);
+    if ( *bitmap )
+    {
+        p = realloc(*bitmap, new_sz);
+        if ( !p )
+        {
+            ERROR("Failed to realloc restore bitmap");
+            errno = ENOMEM;
+            return -1;
+        }
+
+        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+        *bitmap = p;
+    }
+
+    return 0;
+}
+
 /*
  * Set a pfn as populated, expanding the tracking structures if needed. To
  * avoid realloc()ing too excessively, the size increased to the nearest power
@@ -85,13 +110,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
  */
 static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
 {
-    xc_interface *xch = ctx->xch;
+    int rc = 0;
 
     if ( pfn > ctx->restore.max_populated_pfn )
     {
         xen_pfn_t new_max;
         size_t old_sz, new_sz;
-        unsigned long *p;
+        unsigned int i;
+        unsigned long **bitmaps[] =
+        {
+            &ctx->restore.populated_pfns,
+            &ctx->restore.paging.outstanding_pfns,
+            &ctx->restore.paging.requested_pfns,
+            &ctx->restore.paging.ready_pfns,
+            &ctx->restore.paging.dropped_pfns
+        };
 
         /* Round up to the nearest power of two larger than pfn, less 1. */
         new_max = pfn;
@@ -106,17 +139,13 @@ static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
 
         old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
         new_sz = bitmap_size(new_max + 1);
-        p = realloc(ctx->restore.populated_pfns, new_sz);
-        if ( !p )
-        {
-            ERROR("Failed to realloc populated bitmap");
-            errno = ENOMEM;
-            return -1;
-        }
 
-        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+        for ( i = 0; i < ARRAY_SIZE(bitmaps) && !rc; ++i )
+            rc = pfn_bitmap_realloc(ctx, bitmaps[i], old_sz, new_sz);
+
+        if ( rc )
+            return rc;
 
-        ctx->restore.populated_pfns    = p;
         ctx->restore.max_populated_pfn = new_max;
     }
 
@@ -230,25 +259,8 @@ static int process_page_data(struct xc_sr_context *ctx, unsigned count,
     {
         ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
 
-        switch ( types[i] )
-        {
-        case XEN_DOMCTL_PFINFO_NOTAB:
-
-        case XEN_DOMCTL_PFINFO_L1TAB:
-        case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
-        case XEN_DOMCTL_PFINFO_L2TAB:
-        case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
-        case XEN_DOMCTL_PFINFO_L3TAB:
-        case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
-        case XEN_DOMCTL_PFINFO_L4TAB:
-        case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
+        if ( types[i] < XEN_DOMCTL_PFINFO_BROKEN )
             gfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
-            break;
-        }
     }
 
     /* Nothing to do? */
@@ -425,6 +437,859 @@ static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
 }
 
 /*
+ * To prepare for entry to the postcopy phase of live migration:
+ * - enable paging on the domain, and set up the paging ring and event channel
+ * - allocate a locked and aligned paging buffer
+ * - allocate the postcopy page bookkeeping structures
+ */
+static int postcopy_paging_setup(struct xc_sr_context *ctx)
+{
+    int rc;
+    unsigned int i;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    xc_interface *xch = ctx->xch;
+
+    /* Sanity-check the migration stream. */
+    if ( !ctx->restore.postcopy )
+    {
+        ERROR("Received POSTCOPY_PFNS_BEGIN before POSTCOPY_BEGIN");
+        return -1;
+    }
+
+    paging->ring_page = xc_vm_event_enable(xch, ctx->domid,
+                                           HVM_PARAM_PAGING_RING_PFN,
+                                           &paging->evtchn_port);
+    if ( !paging->ring_page )
+    {
+        PERROR("Failed to enable paging");
+        return -1;
+    }
+    paging->paging_enabled = true;
+
+    paging->xce_handle = xenevtchn_open(NULL, 0);
+    if ( !paging->xce_handle )
+    {
+        ERROR("Failed to open paging evtchn");
+        return -1;
+    }
+    paging->evtchn_opened = true;
+
+    rc = xenevtchn_bind_interdomain(paging->xce_handle, ctx->domid,
+                                    paging->evtchn_port);
+    if ( rc < 0 )
+    {
+        ERROR("Failed to bind paging evtchn");
+        return rc;
+    }
+    paging->evtchn_bound = true;
+    paging->port = rc;
+
+    SHARED_RING_INIT((vm_event_sring_t *)paging->ring_page);
+    BACK_RING_INIT(&paging->back_ring, (vm_event_sring_t *)paging->ring_page,
+                   PAGE_SIZE);
+
+    errno = posix_memalign(&paging->buffer, PAGE_SIZE, PAGE_SIZE);
+    if ( errno != 0 )
+    {
+        PERROR("Failed to allocate paging buffer");
+        return -1;
+    }
+
+    rc = mlock(paging->buffer, PAGE_SIZE);
+    if ( rc < 0 )
+    {
+        PERROR("Failed to lock paging buffer");
+        return rc;
+    }
+    paging->buffer_locked = true;
+
+    paging->outstanding_pfns = bitmap_alloc(ctx->restore.max_populated_pfn + 1);
+    paging->requested_pfns = bitmap_alloc(ctx->restore.max_populated_pfn + 1);
+    paging->ready_pfns = bitmap_alloc(ctx->restore.max_populated_pfn + 1);
+    paging->dropped_pfns = bitmap_alloc(ctx->restore.max_populated_pfn + 1);
+
+    paging->pending_requests = malloc(RING_SIZE(&paging->back_ring) *
+                                      sizeof(*paging->pending_requests));
+    paging->request_batch = malloc(RING_SIZE(&paging->back_ring) *
+                                   sizeof(*paging->request_batch));
+    if ( !paging->outstanding_pfns ||
+         !paging->requested_pfns ||
+         !paging->ready_pfns ||
+         !paging->dropped_pfns ||
+         !paging->pending_requests ||
+         !paging->request_batch )
+    {
+        PERROR("Failed to allocate pfn state tracking buffers");
+        return -1;
+    }
+
+    /* All slots are initially empty. */
+    for ( i = 0; i < RING_SIZE(&paging->back_ring); ++i )
+        paging->pending_requests[i].pfn = INVALID_PFN;
+
+    paging->ready = true;
+
+    return 0;
+}
+
+static void postcopy_paging_cleanup(struct xc_sr_context *ctx)
+{
+    int rc;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    xc_interface *xch = ctx->xch;
+
+    if ( paging->ring_page )
+        munmap(paging->ring_page, PAGE_SIZE);
+
+    if ( paging->paging_enabled )
+    {
+        rc = xc_vm_event_control(xch, ctx->domid, XEN_VM_EVENT_DISABLE,
+                                 XEN_DOMCTL_VM_EVENT_OP_PAGING, NULL);
+        if ( rc != 0 )
+            ERROR("Failed to disable paging");
+    }
+
+    if ( paging->evtchn_bound )
+    {
+        rc = xenevtchn_unbind(paging->xce_handle, paging->port);
+        if ( rc != 0 )
+            ERROR("Failed to unbind event port");
+    }
+
+    if ( paging->evtchn_opened )
+    {
+        rc = xenevtchn_close(paging->xce_handle);
+        if ( rc != 0 )
+            ERROR("Failed to close event channel");
+    }
+
+    if ( paging->buffer )
+    {
+        if ( paging->buffer_locked )
+            munlock(paging->buffer, PAGE_SIZE);
+
+        free(paging->buffer);
+    }
+
+    free(paging->outstanding_pfns);
+    free(paging->requested_pfns);
+    free(paging->ready_pfns);
+    free(paging->dropped_pfns);
+    free(paging->pending_requests);
+    free(paging->request_batch);
+}
+
+/* Helpers to query and transition the state of postcopy pfns. */
+
+static inline bool postcopy_pfn_outstanding(struct xc_sr_context *ctx,
+                                            xen_pfn_t pfn)
+{
+    return (pfn <= ctx->restore.max_populated_pfn)
+        ? test_bit(pfn, ctx->restore.paging.outstanding_pfns)
+        : false;
+}
+
+static inline bool postcopy_pfn_requested(struct xc_sr_context *ctx,
+                                          xen_pfn_t pfn)
+{
+    return (pfn <= ctx->restore.max_populated_pfn)
+        ? test_bit(pfn, ctx->restore.paging.requested_pfns)
+        : false;
+}
+
+static inline bool postcopy_pfn_ready(struct xc_sr_context *ctx,
+                                      xen_pfn_t pfn)
+{
+    return (pfn <= ctx->restore.max_populated_pfn)
+        ? test_bit(pfn, ctx->restore.paging.ready_pfns)
+        : false;
+}
+
+static inline bool postcopy_pfn_dropped(struct xc_sr_context *ctx,
+                                        xen_pfn_t pfn)
+{
+    return (pfn <= ctx->restore.max_populated_pfn)
+        ? test_bit(pfn, ctx->restore.paging.dropped_pfns)
+        : false;
+}
+
+static inline bool postcopy_pfn_invalid(struct xc_sr_context *ctx,
+                                        xen_pfn_t pfn)
+{
+    if ( pfn > ctx->restore.max_populated_pfn )
+        return false;
+
+    return !postcopy_pfn_outstanding(ctx, pfn) &&
+           !postcopy_pfn_requested(ctx, pfn) &&
+           !postcopy_pfn_ready(ctx, pfn) &&
+           !postcopy_pfn_dropped(ctx, pfn);
+}
+
+static inline void mark_postcopy_pfn_outstanding(struct xc_sr_context *ctx,
+                                                 xen_pfn_t pfn)
+{
+    assert(pfn <= ctx->restore.max_populated_pfn);
+    assert(postcopy_pfn_invalid(ctx, pfn));
+
+    set_bit(pfn, ctx->restore.paging.outstanding_pfns);
+}
+
+static inline void mark_postcopy_pfn_requested(struct xc_sr_context *ctx,
+                                               xen_pfn_t pfn)
+{
+    assert(pfn <= ctx->restore.max_populated_pfn);
+    assert(postcopy_pfn_outstanding(ctx, pfn));
+
+    clear_bit(pfn, ctx->restore.paging.outstanding_pfns);
+    set_bit(pfn, ctx->restore.paging.requested_pfns);
+}
+
+static inline void mark_postcopy_pfn_ready(struct xc_sr_context *ctx,
+                                           xen_pfn_t pfn)
+{
+    assert(pfn <= ctx->restore.max_populated_pfn);
+    assert(postcopy_pfn_outstanding(ctx, pfn) ||
+           postcopy_pfn_requested(ctx, pfn));
+
+    clear_bit(pfn, ctx->restore.paging.outstanding_pfns);
+    clear_bit(pfn, ctx->restore.paging.requested_pfns);
+    set_bit(pfn, ctx->restore.paging.ready_pfns);
+}
+
+static inline void mark_postcopy_pfn_dropped(struct xc_sr_context *ctx,
+                                             xen_pfn_t pfn)
+{
+    assert(pfn <= ctx->restore.max_populated_pfn);
+    assert(postcopy_pfn_outstanding(ctx, pfn));
+
+    clear_bit(pfn, ctx->restore.paging.outstanding_pfns);
+    set_bit(pfn, ctx->restore.paging.dropped_pfns);
+}
+
+static int process_postcopy_pfns(struct xc_sr_context *ctx, unsigned int count,
+                                 xen_pfn_t *pfns, uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    int rc;
+    unsigned int i;
+    xen_pfn_t bpfn;
+
+    rc = populate_pfns(ctx, count, pfns, types);
+    if ( rc )
+    {
+        ERROR("Failed to populate pfns for batch of %u pages", count);
+        goto out;
+    }
+
+    for ( i = 0; i < count; ++i )
+    {
+        if ( types[i] < XEN_DOMCTL_PFINFO_BROKEN )
+        {
+            bpfn = pfns[i];
+
+            /* We should never see the same pfn twice at this stage.  */
+            if ( !postcopy_pfn_invalid(ctx, bpfn) )
+            {
+                rc = -1;
+                ERROR("Duplicate postcopy pfn %"PRI_xen_pfn, bpfn);
+                goto out;
+            }
+
+            /*
+             * We now consider this pfn 'outstanding' - pending, and not yet
+             * requested.
+             */
+            mark_postcopy_pfn_outstanding(ctx, bpfn);
+            ++paging->nr_pending_pfns;
+
+            /*
+             * Neither nomination nor eviction can be permitted to fail - the
+             * guest isn't yet running, so a failure would imply a foreign or
+             * hypervisor mapping on the page, and that would be bogus because
+             * the migration isn't yet complete.
+             */
+            rc = xc_mem_paging_nominate(xch, ctx->domid, bpfn);
+            if ( rc < 0 )
+            {
+                PERROR("Error nominating postcopy pfn %"PRI_xen_pfn, bpfn);
+                goto out;
+            }
+
+            rc = xc_mem_paging_evict(xch, ctx->domid, bpfn);
+            if ( rc < 0 )
+            {
+                PERROR("Error evicting postcopy pfn %"PRI_xen_pfn, bpfn);
+                goto out;
+            }
+        }
+    }
+
+    rc = 0;
+
+ out:
+    return rc;
+}
+
+static int handle_postcopy_pfns(struct xc_sr_context *ctx,
+                                struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_pages_header *pages = rec->data;
+    int rc;
+    xen_pfn_t *pfns = NULL;
+    uint32_t *types = NULL;
+
+    /* Sanity-check the migration stream. */
+    if ( !ctx->restore.paging.ready )
+    {
+        ERROR("Received POSTCOPY_PFNS record before POSTCOPY_PFNS_BEGIN");
+        rc = -1;
+        goto err;
+    }
+
+    rc = validate_pages_record(ctx, rec, REC_TYPE_POSTCOPY_PFNS);
+    if ( rc )
+        goto err;
+
+    pfns = malloc(pages->count * sizeof(*pfns));
+    types = malloc(pages->count * sizeof(*types));
+    if ( !pfns || !types )
+    {
+        rc = -1;
+        ERROR("Unable to allocate enough memory for %u pfns",
+              pages->count);
+        goto err;
+    }
+
+    if ( rec->length != (sizeof(*pages) + (sizeof(uint64_t) * pages->count)) )
+    {
+        ERROR("POSTCOPY_PFNS record wrong size: length %u, expected "
+              "%zu + %zu", rec->length, sizeof(*pages),
+              (sizeof(uint64_t) * pages->count));
+        rc = -1;
+        goto err;
+    }
+
+    /*
+     * POSTCOPY_PFNS records carry no page contents, so the count of data
+     * pages returned by decode_pages_record() is ignored here.
+     */
+    (void)decode_pages_record(ctx, pages, pfns, types);
+
+    rc = process_postcopy_pfns(ctx, pages->count, pfns, types);
+
+ err:
+    free(types);
+    free(pfns);
+
+    return rc;
+}
+
+static int handle_postcopy_transition(struct xc_sr_context *ctx)
+{
+    int rc;
+    xc_interface *xch = ctx->xch;
+    void *data = ctx->restore.callbacks->data;
+
+    /* Sanity-check the migration stream. */
+    if ( !ctx->restore.paging.ready )
+    {
+        ERROR("Received POSTCOPY_TRANSITION record before 
POSTCOPY_PFNS_BEGIN");
+        return -1;
+    }
+
+    rc = ctx->restore.ops.stream_complete(ctx);
+    if ( rc )
+        return rc;
+
+    ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+                                            ctx->restore.console_gfn,
+                                            data);
+
+    /*
+     * Asynchronously resume the guest.  We'll return when we've been handed
+     * back control of the stream, so that we can begin filling in the
+     * outstanding postcopy page data and forwarding guest requests for specific
+     * pages.
+     */
+    IPRINTF("Postcopy transition: resuming guest");
+    return ctx->restore.callbacks->postcopy_transition(data) ? 0 : -1;
+}
+
+static int postcopy_load_page(struct xc_sr_context *ctx, xen_pfn_t pfn,
+                              void *page_data)
+{
+    int rc;
+    unsigned int i;
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    struct xc_sr_pending_postcopy_request *preq;
+    vm_event_response_t rsp;
+    vm_event_back_ring_t *back_ring = &paging->back_ring;
+
+    assert(postcopy_pfn_outstanding(ctx, pfn) ||
+           postcopy_pfn_requested(ctx, pfn));
+
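+    /*
+     * Stage the received contents in the locked, page-aligned buffer set up
+     * in postcopy_paging_setup(), then ask Xen to use it to back the
+     * paged-out pfn.
+     */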
+    memcpy(paging->buffer, page_data, PAGE_SIZE);
+    rc = xc_mem_paging_load(ctx->xch, ctx->domid, pfn, paging->buffer);
+    if ( rc < 0 )
+    {
+        PERROR("Failed to paging load pfn %"PRI_xen_pfn, pfn);
+        return rc;
+    }
+
+    if ( postcopy_pfn_requested(ctx, pfn) )
+    {
+        for ( i = 0; i < RING_SIZE(back_ring); ++i )
+        {
+            preq = &paging->pending_requests[i];
+            if ( preq->pfn != pfn )
+                continue;
+
+            /* Put the response on the ring. */
+            rsp = (vm_event_response_t)
+            {
+                .version = VM_EVENT_INTERFACE_VERSION,
+                .vcpu_id = preq->vcpu_id,
+                .flags   = (preq->flags & VM_EVENT_FLAG_VCPU_PAUSED),
+                .reason  = VM_EVENT_REASON_MEM_PAGING,
+                .u       = { .mem_paging = { .gfn = pfn } }
+            };
+
+            memcpy(RING_GET_RESPONSE(back_ring, back_ring->rsp_prod_pvt),
+                   &rsp, sizeof(rsp));
+            ++back_ring->rsp_prod_pvt;
+
+            /* And free the pending request slot. */
+            preq->pfn = INVALID_PFN;
+        }
+    }
+
+    --paging->nr_pending_pfns;
+    mark_postcopy_pfn_ready(ctx, pfn);
+    return 0;
+}
+
+static int process_postcopy_page_data(struct xc_sr_context *ctx,
+                                      unsigned int count, xen_pfn_t *pfns,
+                                      uint32_t *types, void *page_data)
+{
+    int rc;
+    unsigned int i;
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    bool push_responses = false;
+
+    for ( i = 0; i < count; ++i )
+    {
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_XTAB:
+        case XEN_DOMCTL_PFINFO_BROKEN:
+        case XEN_DOMCTL_PFINFO_XALLOC:
+            ERROR("Received postcopy pfn %"PRI_xen_pfn
+                  " with invalid type %"PRIu32, pfns[i], types[i]);
+            return -1;
+        default:
+            if ( postcopy_pfn_invalid(ctx, pfns[i]) )
+            {
+                ERROR("Expected pfn %"PRI_xen_pfn" to be invalid", pfns[i]);
+                return -1;
+            }
+            else if ( postcopy_pfn_ready(ctx, pfns[i]) )
+            {
+                ERROR("pfn %"PRI_xen_pfn" already received", pfns[i]);
+                return -1;
+            }
+            else if ( postcopy_pfn_dropped(ctx, pfns[i]) )
+            {
+                /* Nothing to do - move on to the next page. */
+                page_data += PAGE_SIZE;
+            }
+            else
+            {
+                if ( postcopy_pfn_requested(ctx, pfns[i]) )
+                {
+                    DBGPRINTF("Received requested pfn %"PRI_xen_pfn, pfns[i]);
+                    push_responses = true;
+                }
+
+                rc = postcopy_load_page(ctx, pfns[i], page_data);
+                if ( rc )
+                    return rc;
+
+                page_data += PAGE_SIZE;
+            }
+            break;
+        }
+    }
+
+    if ( push_responses )
+    {
+        /*
+         * We put at least one response on the ring as a result of processing
+         * this batch of pages, so we need to push them and kick the ring event
+         * channel.
+         */
+        RING_PUSH_RESPONSES(&paging->back_ring);
+
+        rc = xenevtchn_notify(paging->xce_handle, paging->port);
+        if ( rc )
+        {
+            ERROR("Failed to notify paging event channel");
+            return rc;
+        }
+    }
+
+    return 0;
+}
+
+static int handle_postcopy_page_data(struct xc_sr_context *ctx,
+                                     struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_pages_header *pages = rec->data;
+    unsigned int pages_of_data;
+    int rc = -1;
+
+    xen_pfn_t *pfns = NULL;
+    uint32_t *types = NULL;
+
+    rc = validate_pages_record(ctx, rec, REC_TYPE_POSTCOPY_PAGE_DATA);
+    if ( rc )
+        goto err;
+
+    pfns = malloc(pages->count * sizeof(*pfns));
+    types = malloc(pages->count * sizeof(*types));
+    if ( !pfns || !types )
+    {
+        rc = -1;
+        ERROR("Unable to allocate enough memory for %u pfns",
+              pages->count);
+        goto err;
+    }
+
+    pages_of_data = decode_pages_record(ctx, pages, pfns, types);
+
+    if ( rec->length != (sizeof(*pages) +
+                         (sizeof(uint64_t) * pages->count) +
+                         (PAGE_SIZE * pages_of_data)) )
+    {
+        ERROR("POSTCOPY_PAGE_DATA record wrong size: length %u, expected "
+              "%zu + %zu + %lu", rec->length, sizeof(*pages),
+              (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
+        rc = -1;
+        goto err;
+    }
+
+    rc = process_postcopy_page_data(ctx, pages->count, pfns, types,
+                                    &pages->pfn[pages->count]);
+ err:
+    free(types);
+    free(pfns);
+
+    return rc;
+}
+
+static int forward_postcopy_paging_requests(struct xc_sr_context *ctx,
+                                            unsigned int nr_batch_requests)
+{
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    size_t batchsz = nr_batch_requests * sizeof(*paging->request_batch);
+    struct xc_sr_rec_pages_header phdr =
+    {
+        .count = nr_batch_requests
+    };
+    struct xc_sr_record rec =
+    {
+        .type   = REC_TYPE_POSTCOPY_FAULT,
+        .length = sizeof(phdr),
+        .data   = &phdr
+    };
+
+    return write_split_record(ctx, ctx->restore.send_back_fd, &rec,
+                              paging->request_batch, batchsz);
+}
+
+static int handle_postcopy_paging_requests(struct xc_sr_context *ctx)
+{
+    int rc;
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    struct xc_sr_pending_postcopy_request *preq;
+    vm_event_back_ring_t *back_ring = &paging->back_ring;
+    vm_event_request_t req;
+    vm_event_response_t rsp;
+    xen_pfn_t pfn;
+    bool put_responses = false, drop_requested;
+    unsigned int i, nr_batch_requests = 0;
+
+    while ( RING_HAS_UNCONSUMED_REQUESTS(back_ring) )
+    {
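+        /*
+         * Snapshot the request out of the shared ring before acting on it,
+         * so its contents can't change under our feet.
+         */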
+        RING_COPY_REQUEST(back_ring, back_ring->req_cons, &req);
+        ++back_ring->req_cons;
+
+        drop_requested = !!(req.u.mem_paging.flags & MEM_PAGING_DROP_PAGE);
+        pfn = req.u.mem_paging.gfn;
+
+        DBGPRINTF("Postcopy page fault! %"PRI_xen_pfn, pfn);
+
+        if ( postcopy_pfn_invalid(ctx, pfn) )
+        {
+            ERROR("pfn %"PRI_xen_pfn" does not need to be migrated", pfn);
+            rc = -1;
+            goto err;
+        }
+        else if ( postcopy_pfn_ready(ctx, pfn) || drop_requested )
+        {
+            if ( drop_requested )
+            {
+                if ( postcopy_pfn_outstanding(ctx, pfn) )
+                {
+                    mark_postcopy_pfn_dropped(ctx, pfn);
+                    --paging->nr_pending_pfns;
+                }
+                else
+                {
+                    ERROR("Pager requesting we drop non-paged "
+                          "(or previously-requested) pfn %"PRI_xen_pfn, pfn);
+                    rc = -1;
+                    goto err;
+                }
+            }
+
+            /*
+             * This page has already been loaded (or has been dropped), so we
+             * can respond immediately.
+             */
+            rsp = (vm_event_response_t)
+            {
+                .version = VM_EVENT_INTERFACE_VERSION,
+                .vcpu_id = req.vcpu_id,
+                .flags   = (req.flags & VM_EVENT_FLAG_VCPU_PAUSED),
+                .reason  = VM_EVENT_REASON_MEM_PAGING,
+                .u       = { .mem_paging = { .gfn = pfn } }
+            };
+
+            memcpy(RING_GET_RESPONSE(back_ring, back_ring->rsp_prod_pvt),
+                   &rsp, sizeof(rsp));
+            ++back_ring->rsp_prod_pvt;
+
+            put_responses = true;
+        }
+        else /* implies not dropped AND either outstanding or requested */
+        {
+            if ( postcopy_pfn_outstanding(ctx, pfn) )
+            {
+                /* This is the first time this pfn has been requested. */
+                mark_postcopy_pfn_requested(ctx, pfn);
+
+                paging->request_batch[nr_batch_requests] = pfn;
+                ++nr_batch_requests;
+            }
+
+            /* Find a free pending_requests slot. */
+            for ( i = 0; i < RING_SIZE(back_ring); ++i )
+            {
+                preq = &paging->pending_requests[i];
+                if ( preq->pfn == INVALID_PFN )
+                {
+                    /* Claim this slot. */
+                    preq->pfn = pfn;
+
+                    preq->flags = req.flags;
+                    preq->vcpu_id = req.vcpu_id;
+                    break;
+                }
+            }
+
+            /*
+             * We _must_ find a free slot - there cannot be more outstanding
+             * requests than there are slots in the ring.
+             */
+            assert(i < RING_SIZE(back_ring));
+        }
+    }
+
+    if ( put_responses )
+    {
+        RING_PUSH_RESPONSES(back_ring);
+
+        rc = xenevtchn_notify(paging->xce_handle, paging->port);
+        if ( rc )
+        {
+            ERROR("Failed to notify paging event channel");
+            goto err;
+        }
+    }
+
+    if ( nr_batch_requests )
+    {
+        rc = forward_postcopy_paging_requests(ctx, nr_batch_requests);
+        if ( rc )
+        {
+            ERROR("Failed to forward postcopy paging requests");
+            goto err;
+        }
+    }
+
+    rc = 0;
+
+ err:
+    return rc;
+}
+
+static int write_postcopy_complete_record(struct xc_sr_context *ctx)
+{
+    struct xc_sr_record rec = { REC_TYPE_POSTCOPY_COMPLETE };
+
+    return write_record(ctx, ctx->restore.send_back_fd, &rec);
+}
+
+static int postcopy_restore(struct xc_sr_context *ctx)
+{
+    int rc;
+    int recv_fd = ctx->fd;
+    int old_flags;
+    int port;
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_restore_paging *paging = &ctx->restore.paging;
+    struct xc_sr_read_record_context rrctx;
+    struct xc_sr_record rec = { 0, 0, NULL };
+    struct pollfd pfds[] =
+    {
+        { .fd = xenevtchn_fd(paging->xce_handle), .events = POLLIN },
+        { .fd = recv_fd,                          .events = POLLIN }
+    };
+
+    assert(ctx->restore.postcopy);
+    assert(paging->xce_handle);
+
+    read_record_init(&rrctx, ctx);
+
+    /*
+     * For the duration of the postcopy loop, configure the receive stream as
+     * non-blocking.
+     */
+    old_flags = fcntl(recv_fd, F_GETFL);
+    if ( old_flags == -1 )
+    {
+        rc = old_flags;
+        goto err;
+    }
+
+    assert(!(old_flags & O_NONBLOCK));
+
+    rc = fcntl(recv_fd, F_SETFL, old_flags | O_NONBLOCK);
+    if ( rc == -1 )
+    {
+        goto err;
+    }
+
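+    /*
+     * Main pager loop: alternate between loading page contents as they
+     * arrive on the migration stream and forwarding paging-ring faults to
+     * the sender, until no postcopy pfns remain outstanding or requested.
+     */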
+    while ( paging->nr_pending_pfns )
+    {
+        rc = poll(pfds, ARRAY_SIZE(pfds), -1);
+        if ( rc < 0 )
+        {
+            if ( errno == EINTR )
+                continue;
+
+            PERROR("Failed to poll the pager event channel/restore stream");
+            goto err;
+        }
+
+        /*
+         * Fill in any newly received page data first, on the off chance that
+         * new pager requests are for that data.
+         */
+        if ( pfds[1].revents & POLLIN )
+        {
+            rc = try_read_record(&rrctx, recv_fd, &rec);
+            if ( rc && (errno != EAGAIN) && (errno != EWOULDBLOCK) )
+            {
+                goto err;
+            }
+            else if ( !rc )
+            {
+                read_record_destroy(&rrctx);
+                read_record_init(&rrctx, ctx);
+
+                rc = handle_postcopy_page_data(ctx, &rec);
+                if ( rc )
+                    goto err;
+
+                free(rec.data);
+                rec.data = NULL;
+            }
+        }
+
+        if ( pfds[0].revents & POLLIN )
+        {
+            port = xenevtchn_pending(paging->xce_handle);
+            if ( port == -1 )
+            {
+                ERROR("Failed to read port from pager event channel");
+                rc = -1;
+                goto err;
+            }
+
+            rc = xenevtchn_unmask(paging->xce_handle, port);
+            if ( rc != 0 )
+            {
+                ERROR("Failed to unmask pager event channel port");
+                goto err;
+            }
+
+            rc = handle_postcopy_paging_requests(ctx);
+            if ( rc )
+                goto err;
+        }
+    }
+
+    /*
+     * At this point, all outstanding postcopy pages have been loaded.  We now
+     * need only flush any outstanding requests that may have accumulated in the
+     * ring while we were processing the final POSTCOPY_PAGE_DATA records.
+     */
+    rc = handle_postcopy_paging_requests(ctx);
+    if ( rc )
+        goto err;
+
+    rc = write_postcopy_complete_record(ctx);
+    if ( rc )
+        goto err;
+
+    /*
+     * End-of-stream synchronization: make the receive stream blocking again,
+     * and wait to receive what must be the END record.
+     */
+    rc = fcntl(recv_fd, F_SETFL, old_flags);
+    if ( rc == -1 )
+        goto err;
+
+    rc = read_record(ctx, recv_fd, &rec);
+    if ( rc )
+    {
+        goto err;
+    }
+    else if ( rec.type != REC_TYPE_END )
+    {
+        ERROR("Expected end of stream, received %s", 
rec_type_to_str(rec.type));
+        rc = -1;
+        goto err;
+    }
+
+ err:
+    /*
+     * If _we_ fail here, we can't safely synchronize with the completion of
+     * domain resumption because it might be waiting for us (to fulfill a pager
+     * request).  Since we therefore can't know whether or not the domain was
+     * unpaused, just abruptly bail and let the sender assume the worst.
+     */
+    free(rec.data);
+    read_record_destroy(&rrctx);
+
+    return rc;
+}
+
+/*
  * Send checkpoint dirty pfn list to primary.
  */
 static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
@@ -643,6 +1508,25 @@ static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
         rc = handle_checkpoint(ctx);
         break;
 
+    case REC_TYPE_POSTCOPY_BEGIN:
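+        /* Tolerate only a single POSTCOPY_BEGIN record per stream. */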
+        if ( ctx->restore.postcopy )
+            rc = -1;
+        else
+            ctx->restore.postcopy = true;
+        break;
+
+    case REC_TYPE_POSTCOPY_PFNS_BEGIN:
+        rc = postcopy_paging_setup(ctx);
+        break;
+
+    case REC_TYPE_POSTCOPY_PFNS:
+        rc = handle_postcopy_pfns(ctx, rec);
+        break;
+
+    case REC_TYPE_POSTCOPY_TRANSITION:
+        rc = handle_postcopy_transition(ctx);
+        break;
+
     default:
         rc = ctx->restore.ops.process_record(ctx, rec);
         break;
@@ -715,6 +1599,10 @@ static void cleanup(struct xc_sr_context *ctx)
     if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
         xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
                                        NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+    if ( ctx->restore.postcopy )
+        postcopy_paging_cleanup(ctx);
+
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
@@ -777,7 +1665,8 @@ static int restore(struct xc_sr_context *ctx)
                 goto err;
         }
 
-    } while ( rec.type != REC_TYPE_END );
+    } while ( rec.type != REC_TYPE_END &&
+              rec.type != REC_TYPE_POSTCOPY_TRANSITION );
 
  remus_failover:
 
@@ -788,6 +1677,14 @@ static int restore(struct xc_sr_context *ctx)
         IPRINTF("COLO Failover");
         goto done;
     }
+    else if ( ctx->restore.postcopy )
+    {
+        rc = postcopy_restore(ctx);
+        if ( rc )
+            goto err;
+
+        goto done;
+    }
 
     /*
      * With Remus, if we reach here, there must be some error on primary,
diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restore_x86_hvm.c
index 1dca853..5a6b5bf 100644
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -27,6 +27,27 @@ static int handle_hvm_context(struct xc_sr_context *ctx,
     return 0;
 }
 
+static int handle_hvm_magic_page(struct xc_sr_context *ctx,
+                                 struct xc_sr_rec_hvm_params_entry *entry)
+{
+    int rc;
+    xen_pfn_t pfn = entry->value;
+
+    if ( ctx->restore.postcopy )
+    {
+        rc = populate_pfns(ctx, 1, &pfn, NULL);
+        if ( rc )
+            return rc;
+    }
+
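+    /*
+     * Don't clear the paging ring page: during a postcopy migration it may
+     * already be initialized and in active use by the pager.
+     */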
+    if ( entry->index != HVM_PARAM_PAGING_RING_PFN )
+    {
+        xc_clear_domain_page(ctx->xch, ctx->domid, pfn);
+    }
+
+    return 0;
+}
+
 /*
  * Process an HVM_PARAMS record from the stream.
  */
@@ -71,18 +92,32 @@ static int handle_hvm_params(struct xc_sr_context *ctx,
         {
         case HVM_PARAM_CONSOLE_PFN:
             ctx->restore.console_gfn = entry->value;
-            xc_clear_domain_page(xch, ctx->domid, entry->value);
+            rc = handle_hvm_magic_page(ctx, entry);
             break;
         case HVM_PARAM_STORE_PFN:
             ctx->restore.xenstore_gfn = entry->value;
-            xc_clear_domain_page(xch, ctx->domid, entry->value);
+            rc = handle_hvm_magic_page(ctx, entry);
+            break;
+        case HVM_PARAM_PAGING_RING_PFN:
+            ctx->restore.paging_ring_gfn = entry->value;
+            rc = handle_hvm_magic_page(ctx, entry);
             break;
         case HVM_PARAM_IOREQ_PFN:
         case HVM_PARAM_BUFIOREQ_PFN:
-            xc_clear_domain_page(xch, ctx->domid, entry->value);
+            rc = handle_hvm_magic_page(ctx, entry);
+            break;
+        default:
+            rc = 0;
             break;
         }
 
+        if ( rc )
+        {
+            PERROR("populate/clear magic HVM page %"PRId64" = 0x%016"PRIx64,
+                   entry->index, entry->value);
+            return rc;
+        }
+
         rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value);
         if ( rc < 0 )
         {
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index bffbc45..1354689 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -748,6 +748,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
                                       libxl__bootloader_state *bl,
                                       int rc);
 
+static void domcreate_postcopy_transition_callback(void *user);
+
 static void domcreate_launch_dm(libxl__egc *egc, libxl__multidev *aodevs,
                                 int ret);
 
@@ -1102,6 +1104,13 @@ static void domcreate_bootloader_done(libxl__egc *egc,
             libxl__remus_restore_setup(egc, dcs);
             /* fall through */
         case LIBXL_CHECKPOINTED_STREAM_NONE:
+            /*
+             * When the restore helper initiates the postcopy transition, pick
+             * up in domcreate_postcopy_transition_callback()
+             */
+            callbacks->postcopy_transition =
+                domcreate_postcopy_transition_callback;
+
             libxl__stream_read_start(egc, &dcs->srs);
         }
         return;
@@ -1111,6 +1120,14 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     domcreate_stream_done(egc, &dcs->srs, rc);
 }
 
+/* ----- postcopy live migration ----- */
+
+static void domcreate_postcopy_transition_callback(void *user)
+{
+    /* XXX we're not ready to deal with this yet */
+    assert(0);
+}
+
 void libxl__srm_callout_callback_restore_results(xen_pfn_t store_mfn,
           xen_pfn_t console_mfn, void *user)
 {
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 5647b97..7f59e03 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -34,7 +34,7 @@ our @msgs = (
     [  9, 'srW',    "complete",              [qw(int retval
                                                  int errnoval)] ],
     [ 10, 'scxW',   "precopy_policy", ['struct precopy_stats', 'stats'] ],
-    [ 11, 'scxA',   "postcopy_transition", [] ]
+    [ 11, 'srcxA',  "postcopy_transition", [] ]
 );
 
 #----------------------------------------
-- 
2.7.4

