
Re: [Xen-devel] [PATCH v5 RFC 13/14] tools/libxc: noarch save code



At 06/12/2014 02:14 AM, Andrew Cooper wrote:
> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
> Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
> ---
>  tools/libxc/saverestore/save.c |  545 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 544 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/libxc/saverestore/save.c b/tools/libxc/saverestore/save.c
> index f6ad734..9ad43a5 100644
> --- a/tools/libxc/saverestore/save.c
> +++ b/tools/libxc/saverestore/save.c
> @@ -1,11 +1,554 @@
> +#include <assert.h>
> +#include <arpa/inet.h>
> +
>  #include "common.h"
>  
> +/*
> + * Writes an Image header and Domain header into the stream.
> + */
> +static int write_headers(struct context *ctx, uint16_t guest_type)
> +{
> +    xc_interface *xch = ctx->xch;
> +    int32_t xen_version = xc_version(xch, XENVER_version, NULL);
> +    struct ihdr ihdr =
> +        {
> +            .marker  = IHDR_MARKER,
> +            .id      = htonl(IHDR_ID),
> +            .version = htonl(IHDR_VERSION),
> +            .options = htons(IHDR_OPT_LITTLE_ENDIAN),
> +        };
> +    struct dhdr dhdr =
> +        {
> +            .type       = guest_type,
> +            .page_shift = XC_PAGE_SHIFT,
> +            .xen_major  = (xen_version >> 16) & 0xffff,
> +            .xen_minor  = (xen_version)       & 0xffff,
> +        };
> +
> +    if ( xen_version < 0 )
> +    {
> +        PERROR("Unable to obtain Xen Version");
> +        return -1;
> +    }
> +
> +    if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
> +    {
> +        PERROR("Unable to write Image Header to stream");
> +        return -1;
> +    }
> +
> +    if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
> +    {
> +        PERROR("Unable to write Domain Header to stream");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Writes an END record into the stream.
> + */
> +static int write_end_record(struct context *ctx)
> +{
> +    struct record end = { REC_TYPE_END, 0, NULL };
> +
> +    return write_record(ctx, &end);
> +}
> +
> +/*
> + * Writes a batch of memory as a PAGE_DATA record into the stream.  The batch
> + * is constructed in ctx->save.batch_pfns.
> + *
> + * This function:
> + * - gets the types for each pfn in the batch.
> + * - for each pfn with real data:
> + *   - maps and attempts to localise the pages.
> + * - constructs and writes a PAGE_DATA record into the stream.
> + */
> +static int write_batch(struct context *ctx)
> +{
> +    xc_interface *xch = ctx->xch;
> +    xen_pfn_t *mfns = NULL, *types = NULL;
> +    void *guest_mapping = NULL;
> +    void **guest_data = NULL;
> +    void **local_pages = NULL;
> +    int *errors = NULL, rc = -1;
> +    unsigned i, p, nr_pages = 0;
> +    unsigned nr_pfns = ctx->save.nr_batch_pfns;
> +    void *page, *orig_page;
> +    uint64_t *rec_pfns = NULL;
> +    struct rec_page_data_header hdr = { 0 };
> +    struct record rec =
> +    {
> +        .type = REC_TYPE_PAGE_DATA,
> +    };
> +
> +    assert(nr_pfns != 0);
> +
> +    /* Mfns of the batch pfns. */
> +    mfns = malloc(nr_pfns * sizeof(*mfns));
> +    /* Types of the batch pfns. */
> +    types = malloc(nr_pfns * sizeof(*types));
> +    /* Errors from attempting to map the mfns. */
> +    errors = malloc(nr_pfns * sizeof(*errors));
> +    /* Pointers to page data to send.  Either mapped mfns or local allocations. */
> +    guest_data = calloc(nr_pfns, sizeof(*guest_data));
> +    /* Pointers to locally allocated pages.  Need freeing. */
> +    local_pages = calloc(nr_pfns, sizeof(*local_pages));

This function is called many times, so we allocate and free this
memory over and over, which may hurt performance.

I think we can allocate it once at the setup stage, and only clear
guest_data/local_pages here.
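
Something along these lines might work (untested sketch; the new
ctx->save fields for the preallocated arrays are hypothetical):

    /* In ctx->save.ops.setup(): allocate the batch buffers once. */
    ctx->save.mfns        = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.mfns));
    ctx->save.types       = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.types));
    ctx->save.errors      = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.errors));
    ctx->save.guest_data  = calloc(MAX_BATCH_SIZE, sizeof(*ctx->save.guest_data));
    ctx->save.local_pages = calloc(MAX_BATCH_SIZE, sizeof(*ctx->save.local_pages));

    /* In write_batch(): no malloc()/free(), just reset the pointer arrays. */
    memset(ctx->save.guest_data,  0, nr_pfns * sizeof(*ctx->save.guest_data));
    memset(ctx->save.local_pages, 0, nr_pfns * sizeof(*ctx->save.local_pages));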

> +
> +    if ( !mfns || !types || !errors || !guest_data || !local_pages )
> +    {
> +        ERROR("Unable to allocate arrays for a batch of %u pages",
> +              nr_pfns);
> +        goto err;
> +    }
> +
> +    for ( i = 0; i < nr_pfns; ++i )
> +    {
> +        types[i] = mfns[i] = ctx->ops.pfn_to_gfn(ctx, ctx->save.batch_pfns[i]);
> +
> +        /* Likely a ballooned page. */
> +        if ( mfns[i] == INVALID_MFN )
> +            set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
> +    }
> +
> +    rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
> +    if ( rc )
> +    {
> +        PERROR("Failed to get types for pfn batch");
> +        goto err;
> +    }
> +    rc = -1;
> +
> +    for ( i = 0; i < nr_pfns; ++i )
> +    {
> +        switch ( types[i] )
> +        {
> +        case XEN_DOMCTL_PFINFO_BROKEN:
> +        case XEN_DOMCTL_PFINFO_XALLOC:
> +        case XEN_DOMCTL_PFINFO_XTAB:
> +            continue;
> +        }
> +
> +        mfns[nr_pages++] = mfns[i];
> +    }
> +
> +    if ( nr_pages > 0 )
> +    {
> +        guest_mapping = xc_map_foreign_bulk(
> +            xch, ctx->domid, PROT_READ, mfns, errors, nr_pages);
> +        if ( !guest_mapping )
> +        {
> +            PERROR("Failed to map guest pages");
> +            goto err;
> +        }

To support Remus, we would map/unmap guest memory again and again,
which also hurts performance. We could cache the guest mappings here.
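
For example (rough, untested sketch; ctx->save.map_cache would be a
hypothetical pfn-indexed array of page mappings, torn down once in
ctx->save.ops.cleanup() rather than after every batch):

    /* Reuse the mapping if this pfn was already mapped by an earlier
     * checkpoint; otherwise map the single page and remember it. */
    if ( !ctx->save.map_cache[ctx->save.batch_pfns[i]] )
        ctx->save.map_cache[ctx->save.batch_pfns[i]] =
            xc_map_foreign_pages(xch, ctx->domid, PROT_READ, &mfns[p], 1);
    page = ctx->save.map_cache[ctx->save.batch_pfns[i]];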

Thanks
Wen Congyang

> +    }
> +
> +    for ( i = 0, p = 0; i < nr_pfns; ++i )
> +    {
> +        switch ( types[i] )
> +        {
> +        case XEN_DOMCTL_PFINFO_BROKEN:
> +        case XEN_DOMCTL_PFINFO_XALLOC:
> +        case XEN_DOMCTL_PFINFO_XTAB:
> +            continue;
> +        }
> +
> +        if ( errors[p] )
> +        {
> +            ERROR("Mapping of pfn %#lx (mfn %#lx) failed %d",
> +                  ctx->save.batch_pfns[i], mfns[p], errors[p]);
> +            goto err;
> +        }
> +
> +        orig_page = page = guest_mapping + (p * PAGE_SIZE);
> +        rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
> +        if ( rc )
> +        {
> +            if ( rc == -1 && errno == EAGAIN )
> +            {
> +                set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
> +                types[i] = XEN_DOMCTL_PFINFO_XTAB;
> +                --nr_pages;
> +            }
> +            else
> +                goto err;
> +        }
> +        else
> +            guest_data[i] = page;
> +
> +        if ( page != orig_page )
> +            local_pages[i] = page;
> +        rc = -1;
> +
> +        ++p;
> +    }
> +
> +    rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
> +    if ( !rec_pfns )
> +    {
> +        ERROR("Unable to allocate %zu bytes of memory for page data pfn 
> list",
> +              nr_pfns * sizeof(*rec_pfns));
> +        goto err;
> +    }
> +
> +    hdr.count = nr_pfns;
> +
> +    rec.length = sizeof(hdr);
> +    rec.length += nr_pfns * sizeof(*rec_pfns);
> +    rec.length += nr_pages * PAGE_SIZE;
> +
> +    for ( i = 0; i < nr_pfns; ++i )
> +        rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
> +
> +    if ( write_record_header(ctx, &rec) ||
> +         write_exact(ctx->fd, &hdr, sizeof(hdr)) ||
> +         write_exact(ctx->fd, rec_pfns, nr_pfns * sizeof(*rec_pfns)) )
> +    {
> +        PERROR("Failed to write page_type header to stream");
> +        goto err;
> +    }
> +
> +    for ( i = 0; i < nr_pfns; ++i )
> +    {
> +        if ( guest_data[i] )
> +        {
> +            if ( write_exact(ctx->fd, guest_data[i], PAGE_SIZE) )
> +            {
> +                PERROR("Failed to write page into stream");
> +                goto err;
> +            }
> +
> +            --nr_pages;
> +        }
> +    }
> +
> +    /* Sanity check we have sent all the pages we expected to. */
> +    assert(nr_pages == 0);
> +    rc = ctx->save.nr_batch_pfns = 0;
> +
> + err:
> +    free(rec_pfns);
> +    if ( guest_mapping )
> +        munmap(guest_mapping, nr_pages * PAGE_SIZE);
> +    for ( i = 0; local_pages && i < nr_pfns; ++i )
> +            free(local_pages[i]);
> +    free(local_pages);
> +    free(guest_data);
> +    free(errors);
> +    free(types);
> +    free(mfns);
> +
> +    return rc;
> +}
> +
> +/*
> + * Flush a batch of pfns into the stream.
> + */
> +static int flush_batch(struct context *ctx)
> +{
> +    int rc = 0;
> +
> +    if ( ctx->save.nr_batch_pfns == 0 )
> +        return rc;
> +
> +    rc = write_batch(ctx);
> +
> +    if ( !rc )
> +    {
> +        VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
> +                                    MAX_BATCH_SIZE * sizeof(*ctx->save.batch_pfns));
> +    }
> +
> +    return rc;
> +}
> +
> +/*
> + * Add a single pfn to the batch, flushing the batch if full.
> + */
> +static int add_to_batch(struct context *ctx, xen_pfn_t pfn)
> +{
> +    int rc = 0;
> +
> +    if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
> +        rc = flush_batch(ctx);
> +
> +    if ( rc == 0 )
> +        ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
> +
> +    return rc;
> +}
> +
> +/*
> + * Pause the domain.
> + */
> +static int pause_domain(struct context *ctx)
> +{
> +    xc_interface *xch = ctx->xch;
> +    int rc;
> +
> +    if ( !ctx->dominfo.paused )
> +    {
> +        /* TODO: Properly specify the return value from this callback. */
> +        rc = (ctx->save.callbacks->suspend(ctx->save.callbacks->data) != 1);
> +        if ( rc )
> +        {
> +            ERROR("Failed to suspend domain");
> +            return rc;
> +        }
> +    }
> +
> +    IPRINTF("Domain now paused");
> +    return 0;
> +}
> +
> +/*
> + * Send all domain memory.  This is the heart of the live migration loop.
> + */
> +static int send_domain_memory(struct context *ctx)
> +{
> +    xc_interface *xch = ctx->xch;
> +    DECLARE_HYPERCALL_BUFFER(unsigned long, to_send);
> +    xc_shadow_op_stats_t stats = { -1, -1 };
> +    unsigned pages_written;
> +    unsigned x, max_iter = 5, dirty_threshold = 50;
> +    xen_pfn_t p;
> +    int rc = -1;
> +
> +    to_send = xc_hypercall_buffer_alloc_pages(
> +        xch, to_send, NRPAGES(bitmap_size(ctx->save.p2m_size)));
> +
> +    ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE * sizeof(*ctx->save.batch_pfns));
> +    ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
> +
> +    if ( !ctx->save.batch_pfns || !to_send || !ctx->save.deferred_pages )
> +    {
> +        ERROR("Unable to allocate memory for to_{send,fix}/batch bitmaps");
> +        goto out;
> +    }
> +
> +    if ( xc_shadow_control(xch, ctx->domid,
> +                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
> +                           NULL, 0, NULL, 0, NULL) < 0 )
> +    {
> +        PERROR("Failed to enable logdirty");
> +        goto out;
> +    }
> +
> +    for ( x = 0, pages_written = 0; x < max_iter ; ++x )
> +    {
> +        if ( x == 0 )
> +        {
> +            /* First iteration, send all pages. */
> +            memset(to_send, 0xff, bitmap_size(ctx->save.p2m_size));
> +        }
> +        else
> +        {
> +            /* Else consult the dirty bitmap. */
> +            if ( xc_shadow_control(
> +                     xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
> +                     HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
> +                     NULL, 0, &stats) != ctx->save.p2m_size )
> +            {
> +                PERROR("Failed to retrieve logdirty bitmap");
> +                rc = -1;
> +                goto out;
> +            }
> +            else
> +                DPRINTF("  Wrote %u pages; stats: faults %"PRIu32", dirty %"PRIu32,
> +                        pages_written, stats.fault_count, stats.dirty_count);
> +            pages_written = 0;
> +
> +            if ( stats.dirty_count < dirty_threshold )
> +                break;
> +        }
> +
> +        DPRINTF("Iteration %u", x);
> +
> +        for ( p = 0 ; p < ctx->save.p2m_size; ++p )
> +        {
> +            if ( test_bit(p, to_send) )
> +            {
> +                rc = add_to_batch(ctx, p);
> +                if ( rc )
> +                    goto out;
> +                ++pages_written;
> +            }
> +        }
> +
> +        rc = flush_batch(ctx);
> +        if ( rc )
> +            goto out;
> +    }
> +
> +    rc = pause_domain(ctx);
> +    if ( rc )
> +        goto out;
> +
> +    if ( xc_shadow_control(
> +             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
> +             HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
> +             NULL, 0, &stats) != ctx->save.p2m_size )
> +    {
> +        PERROR("Failed to retrieve logdirty bitmap");
> +        rc = -1;
> +        goto out;
> +    }
> +
> +    for ( p = 0, pages_written = 0 ; p < ctx->save.p2m_size; ++p )
> +    {
> +        if ( test_bit(p, to_send) || test_bit(p, ctx->save.deferred_pages) )
> +        {
> +            rc = add_to_batch(ctx, p);
> +            if ( rc )
> +                goto out;
> +            ++pages_written;
> +        }
> +    }
> +
> +    rc = flush_batch(ctx);
> +    if ( rc )
> +        goto out;
> +
> +    DPRINTF("  Wrote %u pages", pages_written);
> +    IPRINTF("Sent all pages");
> +
> +  out:
> +    xc_hypercall_buffer_free_pages(xch, to_send,
> +                                   NRPAGES(bitmap_size(ctx->save.p2m_size)));
> +    free(ctx->save.deferred_pages);
> +    free(ctx->save.batch_pfns);
> +    return rc;
> +}
> +
> +/*
> + * Save a domain.
> + */
> +static int save(struct context *ctx, uint16_t guest_type)
> +{
> +    xc_interface *xch = ctx->xch;
> +    int rc, saved_rc = 0, saved_errno = 0;
> +
> +    IPRINTF("Saving domain %d, type %s",
> +            ctx->domid, dhdr_type_to_str(guest_type));
> +
> +    rc = ctx->save.ops.setup(ctx);
> +    if ( rc )
> +        goto err;
> +
> +    rc = write_headers(ctx, guest_type);
> +    if ( rc )
> +        goto err;
> +
> +    rc = ctx->save.ops.start_of_stream(ctx);
> +    if ( rc )
> +        goto err;
> +
> +    rc = send_domain_memory(ctx);
> +    if ( rc )
> +        goto err;
> +
> +    /* Refresh domain information now it has paused. */
> +    if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
> +         (ctx->dominfo.domid != ctx->domid) )
> +    {
> +        PERROR("Unable to refresh domain information");
> +        rc = -1;
> +        goto err;
> +    }
> +    else if ( (!ctx->dominfo.shutdown ||
> +               ctx->dominfo.shutdown_reason != SHUTDOWN_suspend ) &&
> +              !ctx->dominfo.paused )
> +    {
> +        ERROR("Domain has not been suspended");
> +        rc = -1;
> +        goto err;
> +    }
> +
> +    rc = ctx->save.ops.end_of_stream(ctx);
> +    if ( rc )
> +        goto err;
> +
> +    rc = write_end_record(ctx);
> +    if ( rc )
> +        goto err;
> +
> +    xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
> +                      NULL, 0, NULL, 0, NULL);
> +
> +    IPRINTF("Save successful");
> +    goto done;
> +
> + err:
> +    saved_errno = errno;
> +    saved_rc = rc;
> +    PERROR("Save failed");
> +
> + done:
> +    rc = ctx->save.ops.cleanup(ctx);
> +    if ( rc )
> +        PERROR("Failed to clean up");
> +
> +    if ( saved_rc )
> +    {
> +        rc = saved_rc;
> +        errno = saved_errno;
> +    }
> +
> +    return rc;
> +};
> +
>  int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters,
>                      uint32_t max_factor, uint32_t flags,
>                      struct save_callbacks* callbacks, int hvm)
>  {
> +    struct context ctx =
> +        {
> +            .xch = xch,
> +            .fd = io_fd,
> +        };
> +
> +    /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions :( */
> +    ctx.save.callbacks = callbacks;
> +
>      IPRINTF("In experimental %s", __func__);
> -    return -1;
> +
> +    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
> +    {
> +        PERROR("Failed to get domain info");
> +        return -1;
> +    }
> +
> +    if ( ctx.dominfo.domid != dom )
> +    {
> +        ERROR("Domain %d does not exist", dom);
> +        return -1;
> +    }
> +
> +    ctx.domid = dom;
> +    IPRINTF("Saving domain %d", dom);
> +
> +    ctx.save.p2m_size = xc_domain_maximum_gpfn(xch, dom) + 1;
> +    if ( ctx.save.p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
> +    {
> +        errno = E2BIG;
> +        ERROR("Cannot save this big a guest");
> +        return -1;
> +    }
> +
> +    if ( ctx.dominfo.hvm )
> +    {
> +        ctx.ops = common_ops_x86_hvm;
> +        ctx.save.ops = save_ops_x86_hvm;
> +        return save(&ctx, DHDR_TYPE_X86_HVM);
> +    }
> +    else
> +    {
> +        ctx.ops = common_ops_x86_pv;
> +        ctx.save.ops = save_ops_x86_pv;
> +        return save(&ctx, DHDR_TYPE_X86_PV);
> +    }
>  }
>  
>  /*
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel