[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH RFC v2 08/23] libxc/migration: defer precopy policy to a callback



From: Joshua Otto <jtotto@xxxxxxxxxxxx>

The precopy phase of the xc_domain_save() live migration algorithm has
historically been implemented to run until either a) (almost) no pages
are dirty or b) some fixed, hard-coded maximum number of precopy
iterations has been exceeded.  This policy and its implementation are
less than ideal for a few reasons:
- the logic of the policy is intertwined with the control flow of the
  mechanism of the precopy stage
- it can't take into account facts external to the immediate
  migration context, such as interactive user input or the passage of
  wall-clock time

To permit users to implement arbitrary higher-level policies governing
when the live migration precopy phase should end, and what should be
done next:
- add a precopy_policy() callback to the xc_domain_save() user-supplied
  callbacks
- during the precopy phase of live migrations, consult this policy after
  each batch of pages transmitted and take the dictated action, which
  may be to a) abort the migration entirely, b) continue with the
  precopy, or c) proceed to the stop-and-copy phase.

For now a simple callback implementing the old policy is hard-coded in
place (to be replaced in a subsequent patch).

Signed-off-by: Joshua Otto <jtotto@xxxxxxxxxxxx>
---
 tools/libxc/include/xenguest.h   |  20 +++-
 tools/libxc/xc_sr_common.h       |  12 ++-
 tools/libxc/xc_sr_save.c         | 193 ++++++++++++++++++++++++++++-----------
 tools/libxl/libxl_save_callout.c |   2 +-
 tools/libxl/libxl_save_helper.c  |   1 -
 5 files changed, 170 insertions(+), 58 deletions(-)

diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index d1f97b9..215abd0 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -32,6 +32,14 @@
  */
 struct xenevtchn_handle;
 
+/* For save's precopy_policy(). */
+struct precopy_stats
+{
+    unsigned int iteration;
+    unsigned int total_written;
+    int dirty_count; /* -1 if unknown */
+};
+
 /* callbacks provided by xc_domain_save */
 struct save_callbacks {
     /* Called after expiration of checkpoint interval,
@@ -39,6 +47,17 @@ struct save_callbacks {
      */
     int (*suspend)(void* data);
 
+    /* Called after every batch of page data sent during the precopy phase of a
+     * live migration to ask the caller what to do next based on the current
+     * state of the precopy migration.
+     */
+#define XGS_POLICY_ABORT          (-1) /* Abandon the migration entirely and
+                                        * tidy up. */
+#define XGS_POLICY_CONTINUE_PRECOPY 0  /* Remain in the precopy phase. */
+#define XGS_POLICY_STOP_AND_COPY    1  /* Immediately suspend and transmit the
+                                        * remaining dirty pages. */
+    int (*precopy_policy)(struct precopy_stats stats, void *data);
+
     /* Called after the guest's dirty pages have been
      *  copied into an output buffer.
      * Callback function resumes the guest & the device model,
@@ -87,7 +106,6 @@ struct domain_save_params {
     uint32_t dom;       /* the id of the domain */
     int save_fd;        /* the fd to save the domain to */
     int recv_fd;        /* the fd to receive live protocol responses */
-    uint32_t max_iters; /* how many precopy iterations before we give up? */
     bool live;          /* is this a live migration? */
     bool debug;         /* are we in debug mode? */
     xc_migration_stream_t stream_type; /* is there checkpointing involved? */
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index f192654..0da0ffc 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -198,12 +198,16 @@ struct xc_sr_context
             /* Further debugging information in the stream. */
             bool debug;
 
-            /* Parameters for tweaking live migration. */
-            unsigned max_iterations;
-            unsigned dirty_threshold;
-
             unsigned long p2m_size;
 
+            enum {
+                XC_SAVE_PHASE_PRECOPY,
+                XC_SAVE_PHASE_STOP_AND_COPY
+            } phase;
+
+            struct precopy_stats stats;
+            int policy_decision;
+
             xen_pfn_t *batch_pfns;
             unsigned nr_batch_pfns;
             unsigned long *deferred_pages;
diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
index 0ab86c3..55b77ff 100644
--- a/tools/libxc/xc_sr_save.c
+++ b/tools/libxc/xc_sr_save.c
@@ -277,13 +277,29 @@ static int write_batch(struct xc_sr_context *ctx)
 }
 
 /*
+ * Test if the batch is full.
+ */
+static bool batch_full(const struct xc_sr_context *ctx)
+{
+    return ctx->save.nr_batch_pfns == MAX_BATCH_SIZE;
+}
+
+/*
+ * Test if the batch is empty.
+ */
+static bool batch_empty(struct xc_sr_context *ctx)
+{
+    return ctx->save.nr_batch_pfns == 0;
+}
+
+/*
  * Flush a batch of pfns into the stream.
  */
 static int flush_batch(struct xc_sr_context *ctx)
 {
     int rc = 0;
 
-    if ( ctx->save.nr_batch_pfns == 0 )
+    if ( batch_empty(ctx) )
         return rc;
 
     rc = write_batch(ctx);
@@ -299,19 +315,12 @@ static int flush_batch(struct xc_sr_context *ctx)
 }
 
 /*
- * Add a single pfn to the batch, flushing the batch if full.
+ * Add a single pfn to the batch.
  */
-static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+static void add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
 {
-    int rc = 0;
-
-    if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
-        rc = flush_batch(ctx);
-
-    if ( rc == 0 )
-        ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
-
-    return rc;
+    assert(ctx->save.nr_batch_pfns < MAX_BATCH_SIZE);
+    ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
 }
 
 /*
@@ -358,43 +367,80 @@ static int suspend_domain(struct xc_sr_context *ctx)
  * Send a subset of pages in the guests p2m, according to the dirty bitmap.
  * Used for each subsequent iteration of the live migration loop.
  *
+ * During the precopy stage of a live migration, test the user-supplied
+ * policy function after each batch of pages and cut off the operation
+ * early if indicated (the dirty pages remaining in this round are transferred
+ * into the deferred_pages bitmap).  This function writes observed precopy
+ * policy decisions to ctx->save.policy_decision; callers must check this upon
+ * return.
+ *
  * Bitmap is bounded by p2m_size.
  */
 static int send_dirty_pages(struct xc_sr_context *ctx,
                             unsigned long entries)
 {
     xc_interface *xch = ctx->xch;
-    xen_pfn_t p;
-    unsigned long written;
+    xen_pfn_t p = 0;
+    unsigned long written = 0;
     int rc;
     DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
                                     &ctx->save.dirty_bitmap_hbuf);
 
-    for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
+    int (*precopy_policy)(struct precopy_stats, void *) =
+        ctx->save.callbacks->precopy_policy;
+    void *data = ctx->save.callbacks->data;
+
+    assert(batch_empty(ctx));
+    while ( p < ctx->save.p2m_size )
     {
-        if ( !test_bit(p, dirty_bitmap) )
-            continue;
+        if ( ctx->save.phase == XC_SAVE_PHASE_PRECOPY )
+        {
+            ctx->save.policy_decision = precopy_policy(ctx->save.stats, data);
+
+            if ( ctx->save.policy_decision == XGS_POLICY_ABORT )
+            {
+                IPRINTF("Precopy policy has requested we abort, cleaning up");
+                return -1;
+            }
+            else if ( ctx->save.policy_decision != XGS_POLICY_CONTINUE_PRECOPY 
)
+            {
+                /*
+                 * Any outstanding dirty pages are now deferred until the next
+                 * phase of the migration.
+                 */
+                bitmap_or(ctx->save.deferred_pages, dirty_bitmap,
+                          ctx->save.p2m_size);
+                if ( entries > written )
+                    ctx->save.nr_deferred_pages += entries - written;
+
+                goto done;
+            }
+        }
+
+        for ( ; p < ctx->save.p2m_size && !batch_full(ctx); ++p )
+        {
+            if ( test_and_clear_bit(p, dirty_bitmap) )
+            {
+                add_to_batch(ctx, p);
+                ++written;
+                ++ctx->save.stats.total_written;
+            }
+        }
 
-        rc = add_to_batch(ctx, p);
+        rc = flush_batch(ctx);
         if ( rc )
             return rc;
 
-        /* Update progress every 4MB worth of memory sent. */
-        if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
-            xc_report_progress_step(xch, written, entries);
-
-        ++written;
+        /* Update progress after every batch (4MB) worth of memory sent. */
+        xc_report_progress_step(xch, written, entries);
     }
 
-    rc = flush_batch(ctx);
-    if ( rc )
-        return rc;
-
     if ( written > entries )
         DPRINTF("Bitmap contained more entries than expected...");
 
     xc_report_progress_step(xch, entries, entries);
 
+ done:
     return ctx->save.ops.check_vm_state(ctx);
 }
 
@@ -452,8 +498,7 @@ static int update_progress_string(struct xc_sr_context *ctx,
     xc_interface *xch = ctx->xch;
     char *new_str = NULL;
 
-    if ( asprintf(&new_str, "Frames iteration %u of %u",
-                  iter, ctx->save.max_iterations) == -1 )
+    if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 )
     {
         PERROR("Unable to allocate new progress string");
         return -1;
@@ -474,20 +519,34 @@ static int send_memory_live(struct xc_sr_context *ctx)
     xc_interface *xch = ctx->xch;
     xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
     char *progress_str = NULL;
-    unsigned x;
     int rc;
 
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->save.dirty_bitmap_hbuf);
+
+    int (*precopy_policy)(struct precopy_stats, void *) =
+        ctx->save.callbacks->precopy_policy;
+    void *data = ctx->save.callbacks->data;
+
     rc = update_progress_string(ctx, &progress_str, 0);
     if ( rc )
         goto out;
 
+    ctx->save.stats = (struct precopy_stats)
+        {
+            .iteration     = 0,
+            .total_written = 0,
+            .dirty_count   = -1
+        };
+
+    /* This has the side-effect of priming ctx->save.policy_decision. */
     rc = send_all_pages(ctx);
     if ( rc )
         goto out;
 
-    for ( x = 1;
-          ((x < ctx->save.max_iterations) &&
-           (stats.dirty_count > ctx->save.dirty_threshold)); ++x )
+    for ( ctx->save.stats.iteration = 1;
+          ctx->save.policy_decision == XGS_POLICY_CONTINUE_PRECOPY;
+          ++ctx->save.stats.iteration )
     {
         if ( xc_shadow_control(
                  xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
@@ -499,10 +558,32 @@ static int send_memory_live(struct xc_sr_context *ctx)
             goto out;
         }
 
-        if ( stats.dirty_count == 0 )
-            break;
+        /* Check the new dirty_count against the policy. */
+        ctx->save.stats.dirty_count = stats.dirty_count;
+        ctx->save.policy_decision = precopy_policy(ctx->save.stats, data);
+        if ( ctx->save.policy_decision == XGS_POLICY_ABORT )
+        {
+            IPRINTF("Precopy policy has requested we abort, cleaning up");
+            rc = -1;
+            goto out;
+        }
+        else if ( ctx->save.policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
+        {
+            bitmap_or(ctx->save.deferred_pages, dirty_bitmap,
+                      ctx->save.p2m_size);
+            ctx->save.nr_deferred_pages += stats.dirty_count;
+            rc = 0;
+            goto out;
+        }
 
-        rc = update_progress_string(ctx, &progress_str, x);
+        /*
+         * After this point we won't know how many pages are really dirty until
+         * the next iteration.
+         */
+        ctx->save.stats.dirty_count = -1;
+
+        rc = update_progress_string(ctx, &progress_str,
+                                    ctx->save.stats.iteration);
         if ( rc )
             goto out;
 
@@ -583,6 +664,8 @@ static int suspend_and_send_dirty(struct xc_sr_context *ctx)
     DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
                                     &ctx->save.dirty_bitmap_hbuf);
 
+    ctx->save.phase = XC_SAVE_PHASE_STOP_AND_COPY;
+
     rc = suspend_domain(ctx);
     if ( rc )
         goto out;
@@ -601,7 +684,7 @@ static int suspend_and_send_dirty(struct xc_sr_context *ctx)
     if ( ctx->save.live )
     {
         rc = update_progress_string(ctx, &progress_str,
-                                    ctx->save.max_iterations);
+                                    ctx->save.stats.iteration);
         if ( rc )
             goto out;
     }
@@ -740,6 +823,9 @@ static int setup(struct xc_sr_context *ctx)
     DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
                                     &ctx->save.dirty_bitmap_hbuf);
 
+    ctx->save.phase = ctx->save.live ? XC_SAVE_PHASE_PRECOPY
+                                     : XC_SAVE_PHASE_STOP_AND_COPY;
+
     rc = ctx->save.ops.setup(ctx);
     if ( rc )
         goto err;
@@ -915,6 +1001,17 @@ static int save(struct xc_sr_context *ctx, uint16_t 
guest_type)
     return rc;
 };
 
+static int simple_precopy_policy(struct precopy_stats stats, void *user)
+{
+    if (stats.dirty_count >= 0 && stats.dirty_count < 50)
+        return XGS_POLICY_STOP_AND_COPY;
+
+    if (stats.iteration >= 5)
+        return XGS_POLICY_STOP_AND_COPY;
+
+    return XGS_POLICY_CONTINUE_PRECOPY;
+}
+
 int xc_domain_save(xc_interface *xch, const struct domain_save_params *params,
                    const struct save_callbacks* callbacks)
 {
@@ -924,8 +1021,12 @@ int xc_domain_save(xc_interface *xch, const struct 
domain_save_params *params,
             .fd = params->save_fd,
         };
 
+    /* XXX use this to shim our precopy_policy in before moving it to libxl */
+    struct save_callbacks overridden_callbacks = *callbacks;
+    overridden_callbacks.precopy_policy = simple_precopy_policy;
+
     /* GCC 4.4 (of CentOS 6.x vintage) can' t initialise anonymous unions. */
-    ctx.save.callbacks = callbacks;
+    ctx.save.callbacks = &overridden_callbacks;
     ctx.save.live  = params->live;
     ctx.save.debug = params->debug;
     ctx.save.checkpointed = params->stream_type;
@@ -936,15 +1037,6 @@ int xc_domain_save(xc_interface *xch, const struct 
domain_save_params *params,
            params->stream_type == XC_MIG_STREAM_REMUS ||
            params->stream_type == XC_MIG_STREAM_COLO);
 
-    /*
-     * TODO: Find some time to better tweak the live migration algorithm.
-     *
-     * These parameters are better than the legacy algorithm especially for
-     * busy guests.
-     */
-    ctx.save.max_iterations = 5;
-    ctx.save.dirty_threshold = 50;
-
     if ( xc_domain_getinfo(xch, params->dom, 1, &ctx.dominfo) != 1 )
     {
         PERROR("Failed to get domain info");
@@ -967,10 +1059,9 @@ int xc_domain_save(xc_interface *xch, const struct 
domain_save_params *params,
 
     ctx.domid = params->dom;
 
-    DPRINTF("fd %d, dom %u, max_iterations %u, dirty_threshold %u, live %d, "
-            "debug %d, type %d, hvm %d", ctx.fd, ctx.domid,
-            ctx.save.max_iterations, ctx.save.dirty_threshold, ctx.save.live,
-            ctx.save.debug, ctx.save.checkpointed, ctx.dominfo.hvm);
+    DPRINTF("fd %d, dom %u, live %d, debug %d, type %d, hvm %d", ctx.fd,
+            ctx.domid, ctx.save.live, ctx.save.debug, ctx.save.checkpointed,
+            ctx.dominfo.hvm);
 
     if ( ctx.dominfo.hvm )
     {
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 0852bcf..4094c0f 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -90,7 +90,7 @@ void libxl__xc_domain_save(libxl__egc *egc, 
libxl__domain_save_state *dss,
         libxl__srm_callout_enumcallbacks_save(&shs->callbacks.save.a);
 
     const unsigned long argnums[] = {
-        dss->domid, 0, dss->live, dss->debug, dss->checkpointed_stream, cbflags
+        dss->domid, dss->live, dss->debug, dss->checkpointed_stream, cbflags
     };
 
     shs->ao = ao;
diff --git a/tools/libxl/libxl_save_helper.c b/tools/libxl/libxl_save_helper.c
index 887b6a2..63c8e15 100644
--- a/tools/libxl/libxl_save_helper.c
+++ b/tools/libxl/libxl_save_helper.c
@@ -251,7 +251,6 @@ int main(int argc, char **argv)
         params.save_fd = io_fd = atoi(NEXTARG);
         params.recv_fd =         atoi(NEXTARG);
         params.dom =             strtoul(NEXTARG,0,10);
-        params.max_iters =       strtoul(NEXTARG,0,10);
         params.live =            strtoul(NEXTARG,0,10);
         params.debug =           strtoul(NEXTARG,0,10);
         params.stream_type =     strtoul(NEXTARG,0,10);
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.