# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
# Date 1176299114 -3600
# Node ID 90a6af455bbd335f1c4f9809e568d1b0af8c074c
# Parent 6e7ef794cdbc1e9ad83e93945a4ece01daebeb0a
[HVM] Save/restore: merge xc_linux_save and xc_hvm_save
into xc_domain_save, like we did for xc_domain_restore
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
---
tools/libxc/xc_hvm_save.c | 755 ---------------
tools/libxc/xc_linux_save.c | 1414 -----------------------------
tools/libxc/Makefile | 4
tools/libxc/ia64/xc_ia64_linux_save.c | 6
tools/libxc/xc_domain_save.c | 1609 ++++++++++++++++++++++++++++++++++
tools/libxc/xenguest.h | 19
tools/libxc/xg_private.c | 11
tools/xcutils/xc_save.c | 9
8 files changed, 1624 insertions(+), 2203 deletions(-)
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/Makefile
--- a/tools/libxc/Makefile Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/Makefile Wed Apr 11 14:45:14 2007 +0100
@@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
GUEST_SRCS-y :=
GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
# symlink libelf from xen/common/libelf/
LIBELF_SRCS := libelf-tools.c libelf-loader.c
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c Wed Apr 11 14:45:14 2007 +0100
@@ -134,8 +134,10 @@ retry:
}
int
-xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+ uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+ int hvm, void *(*init_qemu_maps)(int, unsigned),
+ void (*qemu_flip_buffer)(int, int))
{
DECLARE_DOMCTL;
xc_dominfo_t info;
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_domain_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_domain_save.c Wed Apr 11 14:45:14 2007 +0100
@@ -0,0 +1,1609 @@
+/******************************************************************************
+ * xc_linux_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_domain_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
+#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static xen_pfn_t *live_p2m = NULL;
+
+/* Live mapping of system MFN to PFN table. */
+static xen_pfn_t *live_m2p = NULL;
+static unsigned long m2p_mfn0;
+
+/* grep fodder: machine_to_phys */
+
+#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
+
+/*
+ * Returns TRUE if the given machine frame number has a unique mapping
+ * in the guest's pseudophysical map.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
+ (((_mfn) < (max_mfn)) && \
+ ((mfn_to_pfn(_mfn) < (p2m_size)) && \
+ (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
+
+/* Returns TRUE if MFN is successfully converted to a PFN. */
+#define translate_mfn_to_pfn(_pmfn) \
+({ \
+ unsigned long mfn = *(_pmfn); \
+ int _res = 1; \
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
+ _res = 0; \
+ else \
+ *(_pmfn) = mfn_to_pfn(mfn); \
+ _res; \
+})
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+ ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+ return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+ BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit ( int nr, volatile void * addr)
+{
+ BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
+static inline unsigned int hweight32(unsigned int w)
+{
+ unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+ res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+ res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+ res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+ return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+}
+
+static inline int count_bits ( int nr, volatile void *addr)
+{
+ int i, count = 0;
+ volatile unsigned long *p = (volatile unsigned long *)addr;
+ /* We know that the array is padded to unsigned long. */
+ for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
+ count += hweight32(*p);
+ return count;
+}
+
+static inline int permute( int i, int nr, int order_nr )
+{
+ /* Need a simple permutation function so that we scan pages in a
+ pseudo random order, enabling us to get a better estimate of
+ the domain's page dirtying rate as we go (there are often
+ contiguous ranges of pfns that have similar behaviour, and we
+ want to mix them up. */
+
+ /* e.g. nr->oder 15->4 16->4 17->5 */
+ /* 512MB domain, 128k pages, order 17 */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ QPONMLKJIH
+ GFEDCBA
+ */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ EDCBA
+ QPONM
+ LKJIHGF
+ */
+
+ do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+ while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+ return i;
+}
+
+static uint64_t tv_to_us(struct timeval *new)
+{
+ return (new->tv_sec * 1000000) + new->tv_usec;
+}
+
+static uint64_t llgettimeofday(void)
+{
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ return tv_to_us(&now);
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+ return (((new->tv_sec - old->tv_sec)*1000000) +
+ (new->tv_usec - old->tv_usec));
+}
+
+static int noncached_write(int fd, int live, void *buffer, int len)
+{
+ static int write_count = 0;
+
+ int rc = write(fd,buffer,len);
+
+ write_count += len;
+ if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+ {
+ /* Time to discard cache - dont care if this fails */
+ discard_file_cache(fd, 0 /* no flush */);
+ write_count = 0;
+ }
+
+ return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
+#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU 781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate()
+{
+ mbit_rate = START_MBIT_RATE;
+}
+
+static int ratewrite(int io_fd, int live, void *buf, int n)
+{
+ static int budget = 0;
+ static int burst_time_us = -1;
+ static struct timeval last_put = { 0 };
+ struct timeval now;
+ struct timespec delay;
+ long long delta;
+
+ if ( START_MBIT_RATE == 0 )
+ return noncached_write(io_fd, live, buf, n);
+
+ budget -= n;
+ if ( budget < 0 )
+ {
+ if ( mbit_rate != ombit_rate )
+ {
+ burst_time_us = RATE_TO_BTU / mbit_rate;
+ ombit_rate = mbit_rate;
+ DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+ mbit_rate, BURST_BUDGET, burst_time_us);
+ }
+ if ( last_put.tv_sec == 0 )
+ {
+ budget += BURST_BUDGET;
+ gettimeofday(&last_put, NULL);
+ }
+ else
+ {
+ while ( budget < 0 )
+ {
+ gettimeofday(&now, NULL);
+ delta = tv_delta(&now, &last_put);
+ while ( delta > burst_time_us )
+ {
+ budget += BURST_BUDGET;
+ last_put.tv_usec += burst_time_us;
+ if ( last_put.tv_usec > 1000000
+ {
+ last_put.tv_usec -= 1000000;
+ last_put.tv_sec++;
+ }
+ delta -= burst_time_us;
+ }
+ if ( budget > 0 )
+ break;
+ delay.tv_sec = 0;
+ delay.tv_nsec = 1000 * (burst_time_us - delta);
+ while ( delay.tv_nsec > 0 )
+ if ( nanosleep(&delay, &delay) == 0 )
+ break;
+ }
+ }
+ }
+ return noncached_write(io_fd, live, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live),
(_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+ return (write(fd, buf, count) == count);
+}
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+ xc_shadow_op_stats_t *stats, int print)
+{
+ static struct timeval wall_last;
+ static long long d0_cpu_last;
+ static long long d1_cpu_last;
+
+ struct timeval wall_now;
+ long long wall_delta;
+ long long d0_cpu_now, d0_cpu_delta;
+ long long d1_cpu_now, d1_cpu_delta;
+
+ gettimeofday(&wall_now, NULL);
+
+ d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+ d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+ if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+ DPRINTF("ARRHHH!!\n");
+
+ wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+ if ( wall_delta == 0 )
+ wall_delta = 1;
+
+ d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+ d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+ if ( print )
+ DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+ "dirtied %dMb/s %" PRId32 " pages\n",
+ wall_delta,
+ (int)((d0_cpu_delta*100)/wall_delta),
+ (int)((d1_cpu_delta*100)/wall_delta),
+ (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+ (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+ stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+ if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+ {
+ mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+ + 50;
+ if ( mbit_rate > MAX_MBIT_RATE )
+ mbit_rate = MAX_MBIT_RATE;
+ }
+#endif
+
+ d0_cpu_last = d0_cpu_now;
+ d1_cpu_last = d1_cpu_now;
+ wall_last = wall_now;
+
+ return 0;
+}
+
+
+static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
+ unsigned long *arr, int runs)
+{
+ long long start, now;
+ xc_shadow_op_stats_t stats;
+ int j;
+
+ start = llgettimeofday();
+
+ for ( j = 0; j < runs; j++ )
+ {
+ int i;
+
+ xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ arr, p2m_size, NULL, 0, NULL);
+ DPRINTF("#Flush\n");
+ for ( i = 0; i < 40; i++ )
+ {
+ usleep(50000);
+ now = llgettimeofday();
+ xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+ NULL, 0, NULL, 0, &stats);
+ DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+ ((now-start)+500)/1000,
+ stats.fault_count, stats.dirty_count);
+ }
+ }
+
+ return -1;
+}
+
+
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+ int dom, xc_dominfo_t *info,
+ vcpu_guest_context_t *ctxt)
+{
+ int i = 0;
+
+ if ( !(*suspend)(dom) )
+ {
+ ERROR("Suspend request failed");
+ return -1;
+ }
+
+ retry:
+
+ if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
+ {
+ ERROR("Could not get domain info");
+ return -1;
+ }
+
+ if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
+ ERROR("Could not get vcpu context");
+
+
+ if ( info->dying )
+ {
+ ERROR("domain is dying");
+ return -1;
+ }
+
+ if ( info->crashed )
+ {
+ ERROR("domain has crashed");
+ return -1;
+ }
+
+ if ( info->shutdown )
+ {
+ switch ( info->shutdown_reason )
+ {
+ case SHUTDOWN_poweroff:
+ case SHUTDOWN_reboot:
+ ERROR("domain has shut down");
+ return -1;
+ case SHUTDOWN_suspend:
+ return 0;
+ case SHUTDOWN_crash:
+ ERROR("domain has crashed");
+ return -1;
+ }
+ }
+
+ if ( info->paused )
+ {
+ /* Try unpausing domain, wait, and retest. */
+ xc_domain_unpause( xc_handle, dom );
+ ERROR("Domain was paused. Wait and re-test.");
+ usleep(10000); /* 10ms */
+ goto retry;
+ }
+
+ if ( ++i < 100 )
+ {
+ ERROR("Retry suspend domain");
+ usleep(10000); /* 10ms */
+ goto retry;
+ }
+
+ ERROR("Unable to suspend domain.");
+
+ return -1;
+}
+
+/*
+** Map the top-level page of MFNs from the guest. The guest might not have
+** finished resuming from a previous restore operation, so we wait a while for
+** it to update the MFN to a reasonable value.
+*/
+static void *map_frame_list_list(int xc_handle, uint32_t dom,
+ shared_info_t *shinfo)
+{
+ int count = 100;
+ void *p;
+
+ while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
+ usleep(10000);
+
+ if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
+ {
+ ERROR("Timed out waiting for frame list updated.");
+ return NULL;
+ }
+
+ p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
+ shinfo->arch.pfn_to_mfn_frame_list_list);
+ if ( p == NULL )
+ ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
+
+ return p;
+}
+
+/*
+** During transfer (or in the state file), all page-table pages must be
+** converted into a 'canonical' form where references to actual mfns
+** are replaced with references to the corresponding pfns.
+**
+** This function performs the appropriate conversion, taking into account
+** which entries do not require canonicalization (in particular, those
+** entries which map the virtual address reserved for the hypervisor).
+*/
+static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
+ const void *spage, void *dpage)
+{
+
+ int i, pte_last, xen_start, xen_end, race = 0;
+ uint64_t pte;
+
+ /*
+ ** We need to determine which entries in this page table hold
+ ** reserved hypervisor mappings. This depends on the current
+ ** page table type as well as the number of paging levels.
+ */
+ xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+ if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+ xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+ if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+ xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+ /*
+ ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
+ ** We can spot this by looking for the guest linear mapping which
+ ** Xen always ensures is present in that L2. Guests must ensure
+ ** that this check will fail for other L2s.
+ */
+ if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+ {
+ int hstart;
+ uint64_t he;
+
+ hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+ he = ((const uint64_t *) spage)[hstart];
+
+ if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+ {
+ /* hvirt starts with xen stuff... */
+ xen_start = hstart;
+ }
+ else if ( hvirt_start != 0xf5800000 )
+ {
+ /* old L2s from before hole was shrunk... */
+ hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+ he = ((const uint64_t *) spage)[hstart];
+ if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+ xen_start = hstart;
+ }
+ }
+
+ if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ /*
+ ** XXX SMH: should compute these from hvirt_start (which we have)
+ ** and hvirt_end (which we don't)
+ */
+ xen_start = 256;
+ xen_end = 272;
+ }
+
+ /* Now iterate through the page table, canonicalizing each PTE */
+ for (i = 0; i < pte_last; i++ )
+ {
+ unsigned long pfn, mfn;
+
+ if ( pt_levels == 2 )
+ pte = ((const uint32_t*)spage)[i];
+ else
+ pte = ((const uint64_t*)spage)[i];
+
+ if ( (i >= xen_start) && (i < xen_end) )
+ pte = 0;
+
+ if ( pte & _PAGE_PRESENT )
+ {
+ mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+ {
+ /* This will happen if the type info is stale which
+ is quite feasible under live migration */
+ pfn = 0; /* zap it - we'll retransmit this page later */
+ race = 1; /* inform the caller of race; fatal if !live */
+ }
+ else
+ pfn = mfn_to_pfn(mfn);
+
+ pte &= ~MADDR_MASK_X86;
+ pte |= (uint64_t)pfn << PAGE_SHIFT;
+
+ /*
+ * PAE guest L3Es can contain these flags when running on
+ * a 64bit hypervisor. We zap these here to avoid any
+ * surprise at restore time...
+ */
+ if ( (pt_levels == 3) &&
+ (type == XEN_DOMCTL_PFINFO_L3TAB) &&
+ (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
+ pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+ }
+
+ if ( pt_levels == 2 )
+ ((uint32_t*)dpage)[i] = pte;
+ else
+ ((uint64_t*)dpage)[i] = pte;
+ }
+
+ return race;
+}
+
+static xen_pfn_t *xc_map_m2p(int xc_handle,
+ unsigned long max_mfn,
+ int prot)
+{
+ struct xen_machphys_mfn_list xmml;
+ privcmd_mmap_entry_t *entries;
+ unsigned long m2p_chunks, m2p_size;
+ xen_pfn_t *m2p;
+ xen_pfn_t *extent_start;
+ int i, rc;
+
+ m2p_size = M2P_SIZE(max_mfn);
+ m2p_chunks = M2P_CHUNKS(max_mfn);
+
+ xmml.max_extents = m2p_chunks;
+ if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
+ {
+ ERROR("failed to allocate space for m2p mfns");
+ return NULL;
+ }
+ set_xen_guest_handle(xmml.extent_start, extent_start);
+
+ if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
+ (xmml.nr_extents != m2p_chunks) )
+ {
+ ERROR("xc_get_m2p_mfns");
+ return NULL;
+ }
+
+ if ( (m2p = mmap(NULL, m2p_size, prot,
+ MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
+ {
+ ERROR("failed to mmap m2p");
+ return NULL;
+ }
+
+ if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
+ {
+ ERROR("failed to allocate space for mmap entries");
+ return NULL;
+ }
+
+ for ( i = 0; i < m2p_chunks; i++ )
+ {
+ entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
+ entries[i].mfn = extent_start[i];
+ entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
+ }
+
+ if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
+ entries, m2p_chunks)) < 0 )
+ {
+ ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
+ return NULL;
+ }
+
+ m2p_mfn0 = entries[0].mfn;
+
+ free(extent_start);
+ free(entries);
+
+ return m2p;
+}
+
+
+static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
+ int io_fd,
+ uint32_t dom,
+ vcpu_guest_context_t *ctxt,
+ unsigned long p2m_size,
+ shared_info_t *live_shinfo)
+{
+ /* Double and single indirect references to the live P2M table */
+ xen_pfn_t *live_p2m_frame_list_list = NULL;
+ xen_pfn_t *live_p2m_frame_list = NULL;
+
+ /* A copy of the pfn-to-mfn table frame list. */
+ xen_pfn_t *p2m_frame_list = NULL;
+
+ /* The mapping of the live p2m table itself */
+ xen_pfn_t *p2m = NULL;
+
+ int i, success = 0;
+
+ live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
+ live_shinfo);
+ if ( !live_p2m_frame_list_list )
+ goto out;
+
+ live_p2m_frame_list =
+ xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+ live_p2m_frame_list_list,
+ P2M_FLL_ENTRIES);
+ if ( !live_p2m_frame_list )
+ {
+ ERROR("Couldn't map p2m_frame_list");
+ goto out;
+ }
+
+
+ /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+ the guest must not change which frames are used for this purpose.
+ (its not clear why it would want to change them, and we'll be OK
+ from a safety POV anyhow. */
+
+ p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+ live_p2m_frame_list,
+ P2M_FL_ENTRIES);
+ if ( !p2m )
+ {
+ ERROR("Couldn't map p2m table");
+ goto out;
+ }
+ live_p2m = p2m; /* So that translation macros will work */
+
+ /* Get a local copy of the live_P2M_frame_list */
+ if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
+ {
+ ERROR("Couldn't allocate p2m_frame_list array");
+ goto out;
+ }
+ memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
+
+ /* Canonicalise the pfn-to-mfn table frame-number list. */
+ for ( i = 0; i < p2m_size; i += fpp )
+ {
+ if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
+ {
+ ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
+ ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
+ (uint64_t)p2m_frame_list[i/fpp]);
+ goto out;
+ }
+ }
+
+ /*
+ * Write an extended-info structure to inform the restore code that
+ * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+ * slow paths in the restore code.
+ */
+ if ( (pt_levels == 3) &&
+ (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
+ {
+ unsigned long signature = ~0UL;
+ uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8;
+ uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+ char chunk_sig[] = "vcpu";
+ if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
+ !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
+ !write_exact(io_fd, &chunk_sig, 4) ||
+ !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) ||
+ !write_exact(io_fd, ctxt, sizeof(*ctxt)) )
+ {
+ ERROR("write: extended info");
+ goto out;
+ }
+ }
+
+ if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
+ {
+ ERROR("write: p2m_frame_list");
+ goto out;
+ }
+
+ success = 1;
+
+ out:
+
+ if ( !success && p2m )
+ munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+ if ( live_p2m_frame_list_list )
+ munmap(live_p2m_frame_list_list, PAGE_SIZE);
+
+ if ( live_p2m_frame_list )
+ munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+ if ( p2m_frame_list )
+ free(p2m_frame_list);
+
+ return success ? p2m : NULL;
+}
+
+
+
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+ uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+ int hvm, void *(*init_qemu_maps)(int, unsigned),
+ void (*qemu_flip_buffer)(int, int))
+{
+ xc_dominfo_t info;
+
+ int rc = 1, i, j, last_iter, iter = 0;
+ int live = (flags & XCFLAGS_LIVE);
+ int debug = (flags & XCFLAGS_DEBUG);
+ int race = 0, sent_last_iter, skip_this_iter;
+
+ /* The new domain's shared-info frame number. */
+ unsigned long shared_info_frame;
+
+ /* A copy of the CPU context of the guest. */
+ vcpu_guest_context_t ctxt;
+
+ /* A table containing the type of each PFN (/not/ MFN!). */
+ unsigned long *pfn_type = NULL;
+ unsigned long *pfn_batch = NULL;
+
+ /* A copy of one frame of guest memory. */
+ char page[PAGE_SIZE];
+
+ /* Live mapping of shared info structure */
+ shared_info_t *live_shinfo = NULL;
+
+ /* base of the region in which domain memory is mapped */
+ unsigned char *region_base = NULL;
+
+ /* power of 2 order of p2m_size */
+ int order_nr;
+
+ /* bitmap of pages:
+ - that should be sent this iteration (unless later marked as skip);
+ - to skip this iteration because already dirty;
+ - to fixup by sending at the end if not already resent; */
+ unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
+
+ xc_shadow_op_stats_t stats;
+
+ unsigned long needed_to_fix = 0;
+ unsigned long total_sent = 0;
+
+ uint64_t vcpumap = 1ULL;
+
+ /* HVM: a buffer for holding HVM context */
+ uint32_t hvm_buf_size = 0;
+ uint8_t *hvm_buf = NULL;
+
+ /* HVM: magic frames for ioreqs and xenstore comms. */
+ uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+ /* If no explicit control parameters given, use defaults */
+ max_iters = max_iters ? : DEF_MAX_ITERS;
+ max_factor = max_factor ? : DEF_MAX_FACTOR;
+
+ initialize_mbit_rate();
+
+ if ( !get_platform_info(xc_handle, dom,
+ &max_mfn, &hvirt_start, &pt_levels) )
+ {
+ ERROR("Unable to get platform info.");
+ return 1;
+ }
+
+ if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+ {
+ ERROR("Could not get domain info");
+ return 1;
+ }
+
+ if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
+ {
+ ERROR("Could not get vcpu context");
+ goto out;
+ }
+ shared_info_frame = info.shared_info_frame;
+
+ /* Map the shared info frame */
+ if ( !hvm )
+ {
+ live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+ PROT_READ, shared_info_frame);
+ if ( !live_shinfo )
+ {
+ ERROR("Couldn't map live_shinfo");
+ goto out;
+ }
+ }
+
+ /* Get the size of the P2M table */
+ p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
+
+ /* Domain is still running at this point */
+ if ( live )
+ {
+ /* Live suspend. Enable log-dirty mode. */
+ if ( xc_shadow_control(xc_handle, dom,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL) < 0 )
+ {
+ ERROR("Couldn't enable shadow mode");
+ goto out;
+ }
+
+ if ( hvm )
+ {
+ /* Get qemu-dm logging dirty pages too */
+ void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+ qemu_bitmaps[0] = seg;
+ qemu_bitmaps[1] = seg + BITMAP_SIZE;
+ qemu_active = 0;
+ qemu_non_active = 1;
+ }
+ }
+ else
+ {
+ /* This is a non-live suspend. Suspend the domain .*/
+ if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
+ {
+ ERROR("Domain appears not to have suspended");
+ goto out;
+ }
+ }
+
+ last_iter = !live;
+
+ /* pretend we sent all the pages last iteration */
+ sent_last_iter = p2m_size;
+
+ /* calculate the power of 2 order of p2m_size, e.g.
+ 15->4 16->4 17->5 */
+ for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
+ continue;
+
+ /* Setup to_send / to_fix and to_skip bitmaps */
+ to_send = malloc(BITMAP_SIZE);
+ to_fix = calloc(1, BITMAP_SIZE);
+ to_skip = malloc(BITMAP_SIZE);
+
+ if ( !to_send || !to_fix || !to_skip )
+ {
+ ERROR("Couldn't allocate to_send array");
+ goto out;
+ }
+
+ memset(to_send, 0xff, BITMAP_SIZE);
+
+ if ( lock_pages(to_send, BITMAP_SIZE) )
+ {
+ ERROR("Unable to lock to_send");
+ return 1;
+ }
+
+ /* (to fix is local only) */
+ if ( lock_pages(to_skip, BITMAP_SIZE) )
+ {
+ ERROR("Unable to lock to_skip");
+ return 1;
+ }
+
+ if ( hvm )
+ {
+ /* Need another buffer for HVM context */
+ hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
+ if ( hvm_buf_size == -1 )
+ {
+ ERROR("Couldn't get HVM context size from Xen");
+ goto out;
+ }
+ hvm_buf = malloc(hvm_buf_size);
+ if ( !hvm_buf )
+ {
+ ERROR("Couldn't allocate memory");
+ goto out;
+ }
+ }
+
+ analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
+
+ /* We want zeroed memory so use calloc rather than malloc. */
+ pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+ pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+ if ( (pfn_type == NULL) || (pfn_batch == NULL) )
+ {
+ ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+ errno = ENOMEM;
+ goto out;
+ }
+
+ if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+ {
+ ERROR("Unable to lock");
+ goto out;
+ }
+
+ /* Setup the mfn_to_pfn table mapping */
+ if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
+ {
+ ERROR("Failed to map live M2P table");
+ goto out;
+ }
+
+ /* Start writing out the saved-domain record. */
+ if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+ {
+ ERROR("write: p2m_size");
+ goto out;
+ }
+
+ if ( !hvm )
+ {
+ int err = 0;
+ unsigned long mfn;
+
+ /* Map the P2M table, and write the list of P2M frames */
+ live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
+ &ctxt, p2m_size, live_shinfo);
+ if ( live_p2m == NULL )
+ {
+ ERROR("Failed to map/save the p2m frame list");
+ goto out;
+ }
+
+ /*
+ * Quick belt and braces sanity check.
+ */
+
+ for ( i = 0; i < p2m_size; i++ )
+ {
+ mfn = live_p2m[i];
+ if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
+ {
+ DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
+ mfn, mfn_to_pfn(mfn));
+ err++;
+ }
+ }
+ DPRINTF("Had %d unexplained entries in p2m table\n", err);
+ }
+
+ print_stats(xc_handle, dom, 0, &stats, 0);
+
+ /* Now write out each data page, canonicalising page tables as we go... */
+ for ( ; ; )
+ {
+ unsigned int prev_pc, sent_this_iter, N, batch;
+
+ iter++;
+ sent_this_iter = 0;
+ skip_this_iter = 0;
+ prev_pc = 0;
+ N = 0;
+
+ DPRINTF("Saving memory pages: iter %d 0%%", iter);
+
+ while ( N < p2m_size )
+ {
+ unsigned int this_pc = (N * 100) / p2m_size;
+ int rc;
+
+ if ( (this_pc - prev_pc) >= 5 )
+ {
+ DPRINTF("\b\b\b\b%3d%%", this_pc);
+ prev_pc = this_pc;
+ }
+
+ if ( !last_iter )
+ {
+ /* Slightly wasteful to peek the whole array evey time,
+ but this is fast enough for the moment. */
+ rc = xc_shadow_control(
+ xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
+ p2m_size, NULL, 0, NULL);
+ if ( rc != p2m_size )
+ {
+ ERROR("Error peeking shadow bitmap");
+ goto out;
+ }
+ }
+
+ /* load pfn_type[] with the mfn of all the pages we're doing in
+ this batch. */
+ for ( batch = 0;
+ (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+ N++ )
+ {
+ int n = permute(N, p2m_size, order_nr);
+
+ if ( debug )
+ DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
+ iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
+ test_bit(n, to_send),
+ hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
+
+ if ( !last_iter &&
+ test_bit(n, to_send) &&
+ test_bit(n, to_skip) )
+ skip_this_iter++; /* stats keeping */
+
+ if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+ (test_bit(n, to_send) && last_iter) ||
+ (test_bit(n, to_fix) && last_iter)) )
+ continue;
+
+ /* Skip PFNs that aren't really there */
+ if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
+ || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+ && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
+ continue;
+
+ /*
+ ** we get here if:
+ ** 1. page is marked to_send & hasn't already been re-dirtied
+ ** 2. (ignore to_skip in last iteration)
+ ** 3. add in pages that still need fixup (net bufs)
+ */
+
+ pfn_batch[batch] = n;
+
+ /* Hypercall interfaces operate in PFNs for HVM guests
+ * and MFNs for PV guests */
+ if ( hvm )
+ pfn_type[batch] = n;
+ else
+ pfn_type[batch] = live_p2m[n];
+
+ if ( !is_mapped(pfn_type[batch]) )
+ {
+ /*
+ ** not currently in psuedo-physical map -- set bit
+ ** in to_fix since we must send this page in last_iter
+ ** unless its sent sooner anyhow, or it never enters
+ ** pseudo-physical map (e.g. for ballooned down doms)
+ */
+ set_bit(n, to_fix);
+ continue;
+ }
+
+ if ( last_iter &&
+ test_bit(n, to_fix) &&
+ !test_bit(n, to_send) )
+ {
+ needed_to_fix++;
+ DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+ iter, n, pfn_type[batch]);
+ }
+
+ clear_bit(n, to_fix);
+
+ batch++;
+ }
+
+ if ( batch == 0 )
+ goto skip; /* vanishingly unlikely... */
+
+ region_base = xc_map_foreign_batch(
+ xc_handle, dom, PROT_READ, pfn_type, batch);
+ if ( region_base == NULL )
+ {
+ ERROR("map batch failed");
+ goto out;
+ }
+
+ if ( !hvm )
+ {
+ /* Get page types */
+ for ( j = 0; j < batch; j++ )
+ ((uint32_t *)pfn_type)[j] = pfn_type[j];
+ if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
+ (uint32_t *)pfn_type) )
+ {
+ ERROR("get_pfn_type_batch failed");
+ goto out;
+ }
+ for ( j = batch-1; j >= 0; j-- )
+ pfn_type[j] = ((uint32_t *)pfn_type)[j];
+
+ for ( j = 0; j < batch; j++ )
+ {
+
+ if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
+ XEN_DOMCTL_PFINFO_XTAB )
+ {
+ DPRINTF("type fail: page %i mfn %08lx\n",
+ j, pfn_type[j]);
+ continue;
+ }
+
+ if ( debug )
+ DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
+ " sum= %08lx\n",
+ iter,
+ (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+ pfn_batch[j],
+ pfn_type[j],
+ mfn_to_pfn(pfn_type[j] &
+ ~XEN_DOMCTL_PFINFO_LTAB_MASK),
+ csum_page(region_base + (PAGE_SIZE*j)));
+
+ /* canonicalise mfn->pfn */
+ pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+ pfn_batch[j];
+ }
+ }
+
+ if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
+ {
+ ERROR("Error when writing to state file (2) (errno %d)",
+ errno);
+ goto out;
+ }
+
+ if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+ {
+ ERROR("Error when writing to state file (3) (errno %d)",
+ errno);
+ goto out;
+ }
+
+ /* entering this loop, pfn_type is now in pfns (Not mfns) */
+ for ( j = 0; j < batch; j++ )
+ {
+ unsigned long pfn, pagetype;
+ void *spage = (char *)region_base + (PAGE_SIZE*j);
+
+ pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ /* write out pages in batch */
+ if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+ continue;
+
+ pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+ (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ /* We have a pagetable page: need to rewrite it. */
+ race =
+ canonicalize_pagetable(pagetype, pfn, spage, page);
+
+ if ( race && !live )
+ {
+ ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
+ pagetype);
+ goto out;
+ }
+
+ if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
+ {
+ ERROR("Error when writing to state file (4)"
+ " (errno %d)", errno);
+ goto out;
+ }
+ }
+ else
+ {
+ /* We have a normal page: just write it directly. */
+ if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
+ PAGE_SIZE )
+ {
+ ERROR("Error when writing to state file (5)"
+ " (errno %d)", errno);
+ goto out;
+ }
+ }
+ } /* end of the write out for this batch */
+
+ sent_this_iter += batch;
+
+ munmap(region_base, batch*PAGE_SIZE);
+
+ } /* end of this while loop for this iteration */
+
+ skip:
+
+ total_sent += sent_this_iter;
+
+ DPRINTF("\r %d: sent %d, skipped %d, ",
+ iter, sent_this_iter, skip_this_iter );
+
+ if ( last_iter )
+ {
+ print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+ DPRINTF("Total pages sent= %ld (%.2fx)\n",
+ total_sent, ((float)total_sent)/p2m_size );
+ DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
+ }
+
+ if ( last_iter && debug )
+ {
+ int minusone = -1;
+ memset(to_send, 0xff, BITMAP_SIZE);
+ debug = 0;
+ DPRINTF("Entering debug resend-all mode\n");
+
+ /* send "-1" to put receiver into debug mode */
+ if ( !write_exact(io_fd, &minusone, sizeof(int)) )
+ {
+ ERROR("Error when writing to state file (6) (errno %d)",
+ errno);
+ goto out;
+ }
+
+ continue;
+ }
+
+ if ( last_iter )
+ break;
+
+ if ( live )
+ {
+ if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+ (iter >= max_iters) ||
+ (sent_this_iter+skip_this_iter < 50) ||
+ (total_sent > p2m_size*max_factor) )
+ {
+ DPRINTF("Start last iteration\n");
+ last_iter = 1;
+
+ if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
+ &ctxt) )
+ {
+ ERROR("Domain appears not to have suspended");
+ goto out;
+ }
+
+ DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
+ info.shared_info_frame,
+ (unsigned long)ctxt.user_regs.eip,
+ (unsigned long)ctxt.user_regs.edx);
+ }
+
+ if ( xc_shadow_control(xc_handle, dom,
+ XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
+ p2m_size, NULL, 0, &stats) != p2m_size )
+ {
+ ERROR("Error flushing shadow PT");
+ goto out;
+ }
+
+ if ( hvm )
+ {
+ /* Pull in the dirty bits from qemu-dm too */
+ if ( !last_iter )
+ {
+ qemu_active = qemu_non_active;
+ qemu_non_active = qemu_active ? 0 : 1;
+ qemu_flip_buffer(dom, qemu_active);
+ for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+ {
+ to_send[j] |= qemu_bitmaps[qemu_non_active][j];
+ qemu_bitmaps[qemu_non_active][j] = 0;
+ }
+ }
+ else
+ {
+ for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+ to_send[j] |= qemu_bitmaps[qemu_active][j];
+ }
+ }
+
+ sent_last_iter = sent_this_iter;
+
+ print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+
+ }
+ } /* end of infinite for loop */
+
+ DPRINTF("All memory is saved\n");
+
+ {
+ struct {
+ int minustwo;
+ int max_vcpu_id;
+ uint64_t vcpumap;
+ } chunk = { -2, info.max_vcpu_id };
+
+ if ( info.max_vcpu_id >= 64 )
+ {
+ ERROR("Too many VCPUS in guest!");
+ goto out;
+ }
+
+ for ( i = 1; i <= info.max_vcpu_id; i++ )
+ {
+ xc_vcpuinfo_t vinfo;
+ if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
+ vinfo.online )
+ vcpumap |= 1ULL << i;
+ }
+
+ chunk.vcpumap = vcpumap;
+ if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
+ {
+ ERROR("Error when writing to state file (errno %d)", errno);
+ goto out;
+ }
+ }
+
+ /* Zero terminate */
+ i = 0;
+ if ( !write_exact(io_fd, &i, sizeof(int)) )
+ {
+ ERROR("Error when writing to state file (6') (errno %d)", errno);
+ goto out;
+ }
+
+ if ( hvm )
+ {
+ uint32_t rec_size;
+
+ /* Save magic-page locations. */
+ memset(magic_pfns, 0, sizeof(magic_pfns));
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+ (unsigned long *)&magic_pfns[0]);
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+ (unsigned long *)&magic_pfns[1]);
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+ (unsigned long *)&magic_pfns[2]);
+ if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+ {
+ ERROR("Error when writing to state file (7)");
+ goto out;
+ }
+
+ /* Save vcpu contexts */
+
+ for ( i = 0; i <= info.max_vcpu_id; i++ )
+ {
+ if ( !(vcpumap & (1ULL << i)) )
+ continue;
+
+ if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+ {
+ ERROR("HVM:Could not get vcpu context");
+ goto out;
+ }
+
+ DPRINTF("write vcpu %d context.\n", i);
+ if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+ {
+ ERROR("write vcpu context failed!\n");
+ goto out;
+ }
+ }
+
+ /* Get HVM context from Xen and save it too */
+ if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
+ hvm_buf_size)) == -1 )
+ {
+ ERROR("HVM:Could not get hvm buffer");
+ goto out;
+ }
+
+ if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+ {
+ ERROR("error write hvm buffer size");
+ goto out;
+ }
+
+ if ( !write_exact(io_fd, hvm_buf, rec_size) )
+ {
+ ERROR("write HVM info failed!\n");
+ goto out;
+ }
+
+ /* HVM guests are done now */
+ rc = 0;
+ goto out;
+ }
+
+ /* PV guests only from now on */
+
+ /* Send through a list of all the PFNs that were not in map at the close */
+ {
+ unsigned int i,j;
+ unsigned long pfntab[1024];
+
+ for ( i = 0, j = 0; i < p2m_size; i++ )
+ {
+ if ( !is_mapped(live_p2m[i]) )
+ j++;
+ }
+
+ if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
+ {
+ ERROR("Error when writing to state file (6a) (errno %d)", errno);
+ goto out;
+ }
+
+ for ( i = 0, j = 0; i < p2m_size; )
+ {
+ if ( !is_mapped(live_p2m[i]) )
+ pfntab[j++] = i;
+
+ i++;
+ if ( (j == 1024) || (i == p2m_size) )
+ {
+ if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
+ {
+ ERROR("Error when writing to state file (6b) (errno %d)",
+ errno);
+ goto out;
+ }
+ j = 0;
+ }
+ }
+ }
+
+ /* Canonicalise the suspend-record frame number. */
+ if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
+ {
+ ERROR("Suspend record is not in range of pseudophys map");
+ goto out;
+ }
+
+ for ( i = 0; i <= info.max_vcpu_id; i++ )
+ {
+ if ( !(vcpumap & (1ULL << i)) )
+ continue;
+
+ if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+ {
+ ERROR("No context for VCPU%d", i);
+ goto out;
+ }
+
+ /* Canonicalise each GDT frame number. */
+ for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
+ {
+ if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
+ {
+ ERROR("GDT frame is not in range of pseudophys map");
+ goto out;
+ }
+ }
+
+ /* Canonicalise the page table base pointer. */
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
+ {
+ ERROR("PT base is not in range of pseudophys map");
+ goto out;
+ }
+ ctxt.ctrlreg[3] =
+ xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
+
+ /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
+ if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
+ {
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
+ {
+ ERROR("PT base is not in range of pseudophys map");
+ goto out;
+ }
+ /* Least-significant bit means 'valid PFN'. */
+ ctxt.ctrlreg[1] = 1 |
+ xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
+ }
+
+ if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
+ {
+ ERROR("Error when writing to state file (1) (errno %d)", errno);
+ goto out;
+ }
+ }
+
+ /*
+ * Reset the MFN to be a known-invalid value. See map_frame_list_list().
+ */
+ memcpy(page, live_shinfo, PAGE_SIZE);
+ ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
+ if ( !write_exact(io_fd, page, PAGE_SIZE) )
+ {
+ ERROR("Error when writing to state file (1) (errno %d)", errno);
+ goto out;
+ }
+
+ /* Success! */
+ rc = 0;
+
+ out:
+
+ if ( live )
+ {
+ if ( xc_shadow_control(xc_handle, dom,
+ XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL) < 0 )
+ DPRINTF("Warning - couldn't disable shadow mode");
+ }
+
+ /* Flush last write and discard cache for file. */
+ discard_file_cache(io_fd, 1 /* flush */);
+
+ if ( live_shinfo )
+ munmap(live_shinfo, PAGE_SIZE);
+
+ if ( live_p2m )
+ munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+ if ( live_m2p )
+ munmap(live_m2p, M2P_SIZE(max_mfn));
+
+ free(pfn_type);
+ free(pfn_batch);
+ free(to_send);
+ free(to_fix);
+ free(to_skip);
+
+ DPRINTF("Save exit rc=%d\n",rc);
+
+ return !!rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Wed Apr 11 09:29:00 2007 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,755 +0,0 @@
-/******************************************************************************
- * xc_hvm_save.c
- *
- * Save the state of a running HVM guest.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006 Intel Corperation
- * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/e820.h>
-#include <xen/hvm/params.h>
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_hvm_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
-
-/* Shared-memory bitmaps for getting log-dirty bits from qemu */
-static unsigned long *qemu_bitmaps[2];
-static int qemu_active;
-static int qemu_non_active;
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
-#define BITMAP_SIZE (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
-
-#define BITMAP_ENTRY(_nr,_bmap) \
- ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
- return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline int permute( int i, int nr, int order_nr )
-{
- /* Need a simple permutation function so that we scan pages in a
- pseudo random order, enabling us to get a better estimate of
- the domain's page dirtying rate as we go (there are often
- contiguous ranges of pfns that have similar behaviour, and we
- want to mix them up. */
-
- /* e.g. nr->oder 15->4 16->4 17->5 */
- /* 512MB domain, 128k pages, order 17 */
-
- /*
- QPONMLKJIHGFEDCBA
- QPONMLKJIH
- GFEDCBA
- */
-
- /*
- QPONMLKJIHGFEDCBA
- EDCBA
- QPONM
- LKJIHGF
- */
-
- do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
- while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
- return i;
-}
-
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
-#define initialize_mbit_rate()
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
- return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
- xc_shadow_op_stats_t *stats, int print)
-{
- static struct timeval wall_last;
- static long long d0_cpu_last;
- static long long d1_cpu_last;
-
- struct timeval wall_now;
- long long wall_delta;
- long long d0_cpu_now, d0_cpu_delta;
- long long d1_cpu_now, d1_cpu_delta;
-
- gettimeofday(&wall_now, NULL);
-
- d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
- d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
- if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
- DPRINTF("ARRHHH!!\n");
-
- wall_delta = tv_delta(&wall_now,&wall_last)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
- d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
- if ( print )
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
-
- d0_cpu_last = d0_cpu_now;
- d1_cpu_last = d1_cpu_now;
- wall_last = wall_now;
-
- return 0;
-}
-
-static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size,
- unsigned long *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, pfn_array_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info,
- vcpu_guest_context_t *ctxt)
-{
- int i = 0;
-
- if ( !(*suspend)(dom) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- retry:
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
- {
- ERROR("Could not get domain info");
- return -1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
- ERROR("Could not get vcpu context");
-
- if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) )
- return 0; /* success */
-
- if ( info->paused )
- {
- /* Try unpausing domain, wait, and retest. */
- xc_domain_unpause( xc_handle, dom );
- ERROR("Domain was paused. Wait and re-test.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- if ( ++i < 100 )
- {
- ERROR("Retry suspend domain.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- ERROR("Unable to suspend domain.");
-
- return -1;
-}
-
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int))
-{
- xc_dominfo_t info;
-
- int rc = 1, i, j, last_iter, iter = 0;
- int live = !!(flags & XCFLAGS_LIVE);
- int debug = !!(flags & XCFLAGS_DEBUG);
- int sent_last_iter, skip_this_iter;
-
- /* The highest guest-physical frame number used by the current guest */
- unsigned long max_pfn;
-
- /* The size of an array big enough to contain all guest pfns */
- unsigned long pfn_array_size;
-
- /* Magic frames: ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
-
- /* A table containg the PFNs (/not/ MFN!) to map. */
- xen_pfn_t *pfn_batch = NULL;
-
- /* A copy of hvm domain context buffer*/
- uint32_t hvm_buf_size;
- uint8_t *hvm_buf = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- uint32_t rec_size, nr_vcpus;
-
- /* power of 2 order of pfn_array_size */
- int order_nr;
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty; */
- unsigned long *to_send = NULL, *to_skip = NULL;
-
- xc_shadow_op_stats_t stats;
-
- unsigned long total_sent = 0;
-
- uint64_t vcpumap = 1ULL;
-
- DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
- "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
- live, debug);
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- initialize_mbit_rate();
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
- {
- ERROR("HVM: Could not get domain info");
- return 1;
- }
- nr_vcpus = info.nr_online_vcpus;
-
- if ( mlock(&ctxt, sizeof(ctxt)) )
- {
- ERROR("HVM: Unable to mlock ctxt");
- return 1;
- }
-
- /* Only have to worry about vcpu 0 even for SMP */
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
- {
- ERROR("HVM: Could not get vcpu context");
- goto out;
- }
-
- DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n",
- info.max_memkb, info.nr_pages);
-
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- ERROR("Couldn't enable shadow mode");
- goto out;
- }
- }
- else
- {
- /* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
- {
- ERROR("HVM Domain appears not to have suspended");
- goto out;
- }
- }
-
- last_iter = !live;
-
- max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
-
- DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, "
- "max_memkb=0x%lx, live=%d.\n",
- max_pfn, info.max_memkb, live);
-
- /* Size of any array that covers 0 ... max_pfn */
- pfn_array_size = max_pfn + 1;
- if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
- {
- ERROR("Error when writing to state file (1)");
- goto out;
- }
-
- /* pretend we sent all the pages last iteration */
- sent_last_iter = pfn_array_size;
-
- /* calculate the power of 2 order of pfn_array_size, e.g.
- 15->4 16->4 17->5 */
- for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
- continue;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = malloc(BITMAP_SIZE);
- to_skip = malloc(BITMAP_SIZE);
-
- if ( live )
- {
- /* Get qemu-dm logging dirty pages too */
- void *seg = init_qemu_maps(dom, BITMAP_SIZE);
- qemu_bitmaps[0] = seg;
- qemu_bitmaps[1] = seg + BITMAP_SIZE;
- qemu_active = 0;
- qemu_non_active = 1;
- }
-
- hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
- if ( hvm_buf_size == -1 )
- {
- ERROR("Couldn't get HVM context size from Xen");
- goto out;
- }
- hvm_buf = malloc(hvm_buf_size);
-
- if ( !to_send || !to_skip || !hvm_buf )
- {
- ERROR("Couldn't allocate memory");
- goto out;
- }
-
- memset(to_send, 0xff, BITMAP_SIZE);
-
- if ( lock_pages(to_send, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_send");
- return 1;
- }
-
- /* (to fix is local only) */
- if ( lock_pages(to_skip, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_skip");
- return 1;
- }
-
- analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0);
-
- /* We want zeroed memory so use calloc rather than malloc. */
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- if ( pfn_batch == NULL )
- {
- ERROR("failed to alloc memory for pfn_batch array");
- errno = ENOMEM;
- goto out;
- }
-
- for ( ; ; )
- {
- unsigned int prev_pc, sent_this_iter, N, batch;
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- prev_pc = 0;
- N=0;
-
- DPRINTF("Saving memory pages: iter %d 0%%", iter);
-
- while ( N < pfn_array_size )
- {
- unsigned int this_pc = (N * 100) / pfn_array_size;
- int rc;
-
- if ( (this_pc - prev_pc) >= 5 )
- {
- DPRINTF("\b\b\b\b%3d%%", this_pc);
- prev_pc = this_pc;
- }
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array evey time,
- but this is fast enough for the moment. */
- rc = xc_shadow_control(
- xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
- pfn_array_size, NULL, 0, NULL);
- if ( rc != pfn_array_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_batch[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < pfn_array_size);
- N++ )
- {
- int n = permute(N, pfn_array_size, order_nr);
-
- if ( 0 && debug )
- DPRINTF("%d pfn= %08lx %d \n",
- iter, (unsigned long)n, test_bit(n, to_send));
-
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter)) )
- continue;
-
- /* Skip PFNs that aren't really there */
- if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */
- || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) &&
- n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ )
- continue;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- */
-
- pfn_batch[batch] = n;
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_READ, pfn_batch, batch);
- if ( region_base == 0 )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- /* write num of pfns */
- if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (2)");
- goto out;
- }
-
- /* write all the pfns */
- if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) )
- {
- ERROR("Error when writing to state file (3)");
- goto out;
- }
-
- for ( j = 0; j < batch; j++ )
- {
- if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
- continue;
- if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
- PAGE_SIZE) != PAGE_SIZE )
- {
- ERROR("ERROR when writing to state file (4)");
- goto out;
- }
- }
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- total_sent += sent_this_iter;
-
- DPRINTF("\r %d: sent %d, skipped %d, ",
- iter, sent_this_iter, skip_this_iter );
-
- if ( last_iter )
- {
- print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/pfn_array_size );
- }
-
- if ( last_iter && debug )
- {
- int minusone = -1;
- memset(to_send, 0xff, BITMAP_SIZE);
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( !write_exact(io_fd, &minusone, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6)");
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
- (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > pfn_array_size*max_factor) )
- {
- DPRINTF("Start last iteration for HVM domain\n");
- last_iter = 1;
-
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
- &ctxt))
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND eip %08lx edx %08lx\n",
- (unsigned long)ctxt.user_regs.eip,
- (unsigned long)ctxt.user_regs.edx);
- }
-
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
- pfn_array_size, NULL,
- 0, &stats) != pfn_array_size )
- {
- ERROR("Error flushing shadow PT");
- goto out;
- }
-
- /* Pull in the dirty bits from qemu too */
- if ( !last_iter )
- {
- qemu_active = qemu_non_active;
- qemu_non_active = qemu_active ? 0 : 1;
- qemu_flip_buffer(dom, qemu_active);
- for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
- {
- to_send[j] |= qemu_bitmaps[qemu_non_active][j];
- qemu_bitmaps[qemu_non_active][j] = 0;
- }
- }
- else
- {
- for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
- to_send[j] |= qemu_bitmaps[qemu_active][j];
- }
-
- sent_last_iter = sent_this_iter;
-
- print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
- }
- } /* end of while 1 */
-
-
- DPRINTF("All HVM memory is saved\n");
-
- {
- struct {
- int minustwo;
- int max_vcpu_id;
- uint64_t vcpumap;
- } chunk = { -2, info.max_vcpu_id };
-
- if (info.max_vcpu_id >= 64) {
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for (i = 1; i <= info.max_vcpu_id; i++) {
- xc_vcpuinfo_t vinfo;
- if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
- vinfo.online)
- vcpumap |= 1ULL << i;
- }
-
- chunk.vcpumap = vcpumap;
- if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
- ERROR("Error when writing to state file (errno %d)", errno);
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( !write_exact(io_fd, &i, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6)");
- goto out;
- }
-
- /* Save magic-page locations. */
- memset(magic_pfns, 0, sizeof(magic_pfns));
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
- (unsigned long *)&magic_pfns[0]);
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
- (unsigned long *)&magic_pfns[1]);
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
- (unsigned long *)&magic_pfns[2]);
- if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- ERROR("Error when writing to state file (7)");
- goto out;
- }
-
- /* save vcpu/vmcs contexts */
- for ( i = 0; i < nr_vcpus; i++ )
- {
- if ( !(vcpumap & (1ULL << i)) )
- continue;
-
- if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
- {
- ERROR("HVM:Could not get vcpu context");
- goto out;
- }
-
- DPRINTF("write vcpu %d context.\n", i);
- if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
- {
- ERROR("write vcpu context failed!\n");
- goto out;
- }
- }
-
- if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
- hvm_buf_size)) == -1 )
- {
- ERROR("HVM:Could not get hvm buffer");
- goto out;
- }
-
- if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
- {
- ERROR("error write hvm buffer size");
- goto out;
- }
-
- if ( !write_exact(io_fd, hvm_buf, rec_size) )
- {
- ERROR("write HVM info failed!\n");
- goto out;
- }
-
- /* Success! */
- rc = 0;
-
- out:
-
- if ( live )
- {
- if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- }
-
- free(hvm_buf);
- free(pfn_batch);
- free(to_send);
- free(to_skip);
-
- return !!rc;
-}
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c Wed Apr 11 09:29:00 2007 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1414 +0,0 @@
-/******************************************************************************
- * xc_linux_save.c
- *
- * Save the state of a running Linux session.
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xc_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_linux_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
-
-/* max mfn of the whole machine */
-static unsigned long max_mfn;
-
-/* virtual starting address of the hypervisor */
-static unsigned long hvirt_start;
-
-/* #levels of page tables used by the current guest */
-static unsigned int pt_levels;
-
-/* number of pfns this guest has (i.e. number of entries in the P2M) */
-static unsigned long p2m_size;
-
-/* Live mapping of the table mapping each PFN to its current MFN. */
-static xen_pfn_t *live_p2m = NULL;
-
-/* Live mapping of system MFN to PFN table. */
-static xen_pfn_t *live_m2p = NULL;
-static unsigned long m2p_mfn0;
-
-/* grep fodder: machine_to_phys */
-
-#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
-
-/*
- * Returns TRUE if the given machine frame number has a unique mapping
- * in the guest's pseudophysical map.
- */
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
- (((_mfn) < (max_mfn)) && \
- ((mfn_to_pfn(_mfn) < (p2m_size)) && \
- (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
-
-/* Returns TRUE if MFN is successfully converted to a PFN. */
-#define translate_mfn_to_pfn(_pmfn) \
-({ \
- unsigned long mfn = *(_pmfn); \
- int _res = 1; \
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
- _res = 0; \
- else \
- *(_pmfn) = mfn_to_pfn(mfn); \
- _res; \
-})
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITMAP_SIZE ((p2m_size + BITS_PER_LONG - 1) / 8)
-
-#define BITMAP_ENTRY(_nr,_bmap) \
- ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
- return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline void set_bit ( int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
-}
-
-/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
-static inline unsigned int hweight32(unsigned int w)
-{
- unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
- res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
- res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
- res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
- return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
-}
-
-static inline int count_bits ( int nr, volatile void *addr)
-{
- int i, count = 0;
- volatile unsigned long *p = (volatile unsigned long *)addr;
- /* We know that the array is padded to unsigned long. */
- for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
- count += hweight32(*p);
- return count;
-}
-
-static inline int permute( int i, int nr, int order_nr )
-{
- /* Need a simple permutation function so that we scan pages in a
- pseudo random order, enabling us to get a better estimate of
- the domain's page dirtying rate as we go (there are often
- contiguous ranges of pfns that have similar behaviour, and we
- want to mix them up. */
-
- /* e.g. nr->oder 15->4 16->4 17->5 */
- /* 512MB domain, 128k pages, order 17 */
-
- /*
- QPONMLKJIHGFEDCBA
- QPONMLKJIH
- GFEDCBA
- */
-
- /*
- QPONMLKJIHGFEDCBA
- EDCBA
- QPONM
- LKJIHGF
- */
-
- do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
- while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
- return i;
-}
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-static int noncached_write(int fd, int live, void *buffer, int len)
-{
- static int write_count = 0;
-
- int rc = write(fd,buffer,len);
-
- write_count += len;
- if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
- {
- /* Time to discard cache - dont care if this fails */
- discard_file_cache(fd, 0 /* no flush */);
- write_count = 0;
- }
-
- return rc;
-}
-
-#ifdef ADAPTIVE_SAVE
-
-/*
-** We control the rate at which we transmit (or save) to minimize impact
-** on running domains (including the target if we're doing live migrate).
-*/
-
-#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
-#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
-
-/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
-#define RATE_TO_BTU 781250
-
-/* Amount in bytes we allow ourselves to send in a burst */
-#define BURST_BUDGET (100*1024)
-
-/* We keep track of the current and previous transmission rate */
-static int mbit_rate, ombit_rate = 0;
-
-/* Have we reached the maximum transmission rate? */
-#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
-
-static inline void initialize_mbit_rate()
-{
- mbit_rate = START_MBIT_RATE;
-}
-
-static int ratewrite(int io_fd, int live, void *buf, int n)
-{
- static int budget = 0;
- static int burst_time_us = -1;
- static struct timeval last_put = { 0 };
- struct timeval now;
- struct timespec delay;
- long long delta;
-
- if ( START_MBIT_RATE == 0 )
- return noncached_write(io_fd, live, buf, n);
-
- budget -= n;
- if ( budget < 0 )
- {
- if ( mbit_rate != ombit_rate )
- {
- burst_time_us = RATE_TO_BTU / mbit_rate;
- ombit_rate = mbit_rate;
- DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
- mbit_rate, BURST_BUDGET, burst_time_us);
- }
- if ( last_put.tv_sec == 0 )
- {
- budget += BURST_BUDGET;
- gettimeofday(&last_put, NULL);
- }
- else
- {
- while ( budget < 0 )
- {
- gettimeofday(&now, NULL);
- delta = tv_delta(&now, &last_put);
- while ( delta > burst_time_us )
- {
- budget += BURST_BUDGET;
- last_put.tv_usec += burst_time_us;
- if ( last_put.tv_usec > 1000000
- {
- last_put.tv_usec -= 1000000;
- last_put.tv_sec++;
- }
- delta -= burst_time_us;
- }
- if ( budget > 0 )
- break;
- delay.tv_sec = 0;
- delay.tv_nsec = 1000 * (burst_time_us - delta);
- while ( delay.tv_nsec > 0 )
- if ( nanosleep(&delay, &delay) == 0 )
- break;
- }
- }
- }
- return noncached_write(io_fd, live, buf, n);
-}
-
-#else /* ! ADAPTIVE SAVE */
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live),
(_buf), (_n))
-#define initialize_mbit_rate()
-
-#endif
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
- return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
- xc_shadow_op_stats_t *stats, int print)
-{
- static struct timeval wall_last;
- static long long d0_cpu_last;
- static long long d1_cpu_last;
-
- struct timeval wall_now;
- long long wall_delta;
- long long d0_cpu_now, d0_cpu_delta;
- long long d1_cpu_now, d1_cpu_delta;
-
- gettimeofday(&wall_now, NULL);
-
- d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
- d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
- if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
- DPRINTF("ARRHHH!!\n");
-
- wall_delta = tv_delta(&wall_now,&wall_last)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
- d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
- if ( print )
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
-
-#ifdef ADAPTIVE_SAVE
- if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
- {
- mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
- + 50;
- if ( mbit_rate > MAX_MBIT_RATE )
- mbit_rate = MAX_MBIT_RATE;
- }
-#endif
-
- d0_cpu_last = d0_cpu_now;
- d1_cpu_last = d1_cpu_now;
- wall_last = wall_now;
-
- return 0;
-}
-
-
-static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
- unsigned long *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, p2m_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info,
- vcpu_guest_context_t *ctxt)
-{
- int i = 0;
-
- if ( !(*suspend)(dom) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- retry:
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
- {
- ERROR("Could not get domain info");
- return -1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
- ERROR("Could not get vcpu context");
-
-
- if ( info->dying )
- {
- ERROR("domain is dying");
- return -1;
- }
-
- if ( info->crashed )
- {
- ERROR("domain has crashed");
- return -1;
- }
-
- if ( info->shutdown )
- {
- switch ( info->shutdown_reason )
- {
- case SHUTDOWN_poweroff:
- case SHUTDOWN_reboot:
- ERROR("domain has shut down");
- return -1;
- case SHUTDOWN_suspend:
- return 0;
- case SHUTDOWN_crash:
- ERROR("domain has crashed");
- return -1;
- }
- }
-
- if ( info->paused )
- {
- /* Try unpausing domain, wait, and retest. */
- xc_domain_unpause( xc_handle, dom );
- ERROR("Domain was paused. Wait and re-test.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- if ( ++i < 100 )
- {
- ERROR("Retry suspend domain");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- ERROR("Unable to suspend domain.");
-
- return -1;
-}
-
-/*
-** Map the top-level page of MFNs from the guest. The guest might not have
-** finished resuming from a previous restore operation, so we wait a while for
-** it to update the MFN to a reasonable value.
-*/
-static void *map_frame_list_list(int xc_handle, uint32_t dom,
- shared_info_t *shinfo)
-{
- int count = 100;
- void *p;
-
- while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
- usleep(10000);
-
- if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
- {
- ERROR("Timed out waiting for frame list updated.");
- return NULL;
- }
-
- p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
- shinfo->arch.pfn_to_mfn_frame_list_list);
- if ( p == NULL )
- ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
-
- return p;
-}
-
-/*
-** During transfer (or in the state file), all page-table pages must be
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-**
-** This function performs the appropriate conversion, taking into account
-** which entries do not require canonicalization (in particular, those
-** entries which map the virtual address reserved for the hypervisor).
-*/
-static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
- const void *spage, void *dpage)
-{
-
- int i, pte_last, xen_start, xen_end, race = 0;
- uint64_t pte;
-
- /*
- ** We need to determine which entries in this page table hold
- ** reserved hypervisor mappings. This depends on the current
- ** page table type as well as the number of paging levels.
- */
- xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
-
- if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
-
- if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
- xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
- /*
- ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
- ** We can spot this by looking for the guest linear mapping which
- ** Xen always ensures is present in that L2. Guests must ensure
- ** that this check will fail for other L2s.
- */
- if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- {
- int hstart;
- uint64_t he;
-
- hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
-
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
- {
- /* hvirt starts with xen stuff... */
- xen_start = hstart;
- }
- else if ( hvirt_start != 0xf5800000 )
- {
- /* old L2s from before hole was shrunk... */
- hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
- xen_start = hstart;
- }
- }
-
- if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** XXX SMH: should compute these from hvirt_start (which we have)
- ** and hvirt_end (which we don't)
- */
- xen_start = 256;
- xen_end = 272;
- }
-
- /* Now iterate through the page table, canonicalizing each PTE */
- for (i = 0; i < pte_last; i++ )
- {
- unsigned long pfn, mfn;
-
- if ( pt_levels == 2 )
- pte = ((const uint32_t*)spage)[i];
- else
- pte = ((const uint64_t*)spage)[i];
-
- if ( (i >= xen_start) && (i < xen_end) )
- pte = 0;
-
- if ( pte & _PAGE_PRESENT )
- {
- mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- /* This will happen if the type info is stale which
- is quite feasible under live migration */
- pfn = 0; /* zap it - we'll retransmit this page later */
- race = 1; /* inform the caller of race; fatal if !live */
- }
- else
- pfn = mfn_to_pfn(mfn);
-
- pte &= ~MADDR_MASK_X86;
- pte |= (uint64_t)pfn << PAGE_SHIFT;
-
- /*
- * PAE guest L3Es can contain these flags when running on
- * a 64bit hypervisor. We zap these here to avoid any
- * surprise at restore time...
- */
- if ( (pt_levels == 3) &&
- (type == XEN_DOMCTL_PFINFO_L3TAB) &&
- (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
- pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
- }
-
- if ( pt_levels == 2 )
- ((uint32_t*)dpage)[i] = pte;
- else
- ((uint64_t*)dpage)[i] = pte;
- }
-
- return race;
-}
-
-static xen_pfn_t *xc_map_m2p(int xc_handle,
- unsigned long max_mfn,
- int prot)
-{
- struct xen_machphys_mfn_list xmml;
- privcmd_mmap_entry_t *entries;
- unsigned long m2p_chunks, m2p_size;
- xen_pfn_t *m2p;
- xen_pfn_t *extent_start;
- int i, rc;
-
- m2p_size = M2P_SIZE(max_mfn);
- m2p_chunks = M2P_CHUNKS(max_mfn);
-
- xmml.max_extents = m2p_chunks;
- if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
- {
- ERROR("failed to allocate space for m2p mfns");
- return NULL;
- }
- set_xen_guest_handle(xmml.extent_start, extent_start);
-
- if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
- (xmml.nr_extents != m2p_chunks) )
- {
- ERROR("xc_get_m2p_mfns");
- return NULL;
- }
-
- if ( (m2p = mmap(NULL, m2p_size, prot,
- MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
- {
- ERROR("failed to mmap m2p");
- return NULL;
- }
-
- if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
- {
- ERROR("failed to allocate space for mmap entries");
- return NULL;
- }
-
- for ( i = 0; i < m2p_chunks; i++ )
- {
- entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
- entries[i].mfn = extent_start[i];
- entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
- }
-
- if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
- entries, m2p_chunks)) < 0 )
- {
- ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
- return NULL;
- }
-
- m2p_mfn0 = entries[0].mfn;
-
- free(extent_start);
- free(entries);
-
- return m2p;
-}
-
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int))
-{
- xc_dominfo_t info;
-
- int rc = 1, i, j, last_iter, iter = 0;
- int live = (flags & XCFLAGS_LIVE);
- int debug = (flags & XCFLAGS_DEBUG);
- int race = 0, sent_last_iter, skip_this_iter;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
-
- /* A table containg the type of each PFN (/not/ MFN!). */
- unsigned long *pfn_type = NULL;
- unsigned long *pfn_batch = NULL;
-
- /* A temporary mapping, and a copy, of one frame of guest memory. */
- char page[PAGE_SIZE];
-
- /* Double and single indirect references to the live P2M table */
- xen_pfn_t *live_p2m_frame_list_list = NULL;
- xen_pfn_t *live_p2m_frame_list = NULL;
-
- /* A copy of the pfn-to-mfn table frame list. */
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* Live mapping of shared info structure */
- shared_info_t *live_shinfo = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- /* power of 2 order of p2m_size */
- int order_nr;
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty;
- - to fixup by sending at the end if not already resent; */
- unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
-
- xc_shadow_op_stats_t stats;
-
- unsigned long needed_to_fix = 0;
- unsigned long total_sent = 0;
-
- uint64_t vcpumap = 1ULL;
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- initialize_mbit_rate();
-
- if ( !get_platform_info(xc_handle, dom,
- &max_mfn, &hvirt_start, &pt_levels) )
- {
- ERROR("Unable to get platform info.");
- return 1;
- }
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
- {
- ERROR("Could not get domain info");
- return 1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
- {
- ERROR("Could not get vcpu context");
- goto out;
- }
- shared_info_frame = info.shared_info_frame;
-
- /* Map the shared info frame */
- if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
- PROT_READ, shared_info_frame)) )
- {
- ERROR("Couldn't map live_shinfo");
- goto out;
- }
-
- p2m_size = live_shinfo->arch.max_pfn;
-
- live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
- live_shinfo);
- if ( !live_p2m_frame_list_list )
- goto out;
-
- live_p2m_frame_list =
- xc_map_foreign_batch(xc_handle, dom, PROT_READ,
- live_p2m_frame_list_list,
- P2M_FLL_ENTRIES);
- if ( !live_p2m_frame_list )
- {
- ERROR("Couldn't map p2m_frame_list");
- goto out;
- }
-
- /* Map all the frames of the pfn->mfn table. For migrate to succeed,
- the guest must not change which frames are used for this purpose.
- (its not clear why it would want to change them, and we'll be OK
- from a safety POV anyhow. */
-
- live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
- live_p2m_frame_list,
- P2M_FL_ENTRIES);
- if ( !live_p2m )
- {
- ERROR("Couldn't map p2m table");
- goto out;
- }
-
- /* Setup the mfn_to_pfn table mapping */
- if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
- {
- ERROR("Failed to map live M2P table");
- goto out;
- }
-
-
- /* Get a local copy of the live_P2M_frame_list */
- if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list array");
- goto out;
- }
- memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
-
- /* Canonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < p2m_size; i += fpp )
- {
- if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
- {
- ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
- ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
- (uint64_t)p2m_frame_list[i/fpp]);
- goto out;
- }
- }
-
- /* Domain is still running at this point */
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- ERROR("Couldn't enable shadow mode");
- goto out;
- }
- }
- else
- {
- /* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- }
-
- last_iter = !live;
-
- /* pretend we sent all the pages last iteration */
- sent_last_iter = p2m_size;
-
- /* calculate the power of 2 order of p2m_size, e.g.
- 15->4 16->4 17->5 */
- for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
- continue;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = malloc(BITMAP_SIZE);
- to_fix = calloc(1, BITMAP_SIZE);
- to_skip = malloc(BITMAP_SIZE);
-
- if ( !to_send || !to_fix || !to_skip )
- {
- ERROR("Couldn't allocate to_send array");
- goto out;
- }
-
- memset(to_send, 0xff, BITMAP_SIZE);
-
- if ( lock_pages(to_send, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_send");
- return 1;
- }
-
- /* (to fix is local only) */
- if ( lock_pages(to_skip, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_skip");
- return 1;
- }
-
- analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
-
- /* We want zeroed memory so use calloc rather than malloc. */
- pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- if ( (pfn_type == NULL) || (pfn_batch == NULL) )
- {
- ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
- errno = ENOMEM;
- goto out;
- }
-
- if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
- {
- ERROR("Unable to lock");
- goto out;
- }
-
- /*
- * Quick belt and braces sanity check.
- */
- {
- int err=0;
- unsigned long mfn;
- for ( i = 0; i < p2m_size; i++ )
- {
- mfn = live_p2m[i];
- if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
- {
- DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
- mfn, mfn_to_pfn(mfn));
- err++;
- }
- }
- DPRINTF("Had %d unexplained entries in p2m table\n", err);
- }
-
- /* Start writing out the saved-domain record. */
- if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
- {
- ERROR("write: p2m_size");
- goto out;
- }
-
- /*
- * Write an extended-info structure to inform the restore code that
- * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
- * slow paths in the restore code.
- */
- if ( (pt_levels == 3) &&
- (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
- {
- unsigned long signature = ~0UL;
- uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8;
- uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
- char chunk_sig[] = "vcpu";
- if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
- !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
- !write_exact(io_fd, &chunk_sig, 4) ||
- !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) ||
- !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
- {
- ERROR("write: extended info");
- goto out;
- }
- }
-
- if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
- {
- ERROR("write: p2m_frame_list");
- goto out;
- }
-
- print_stats(xc_handle, dom, 0, &stats, 0);
-
- /* Now write out each data page, canonicalising page tables as we go... */
- for ( ; ; )
- {
- unsigned int prev_pc, sent_this_iter, N, batch;
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- prev_pc = 0;
- N = 0;
-
- DPRINTF("Saving memory pages: iter %d 0%%", iter);
-
- while ( N < p2m_size )
- {
- unsigned int this_pc = (N * 100) / p2m_size;
- int rc;
-
- if ( (this_pc - prev_pc) >= 5 )
- {
- DPRINTF("\b\b\b\b%3d%%", this_pc);
- prev_pc = this_pc;
- }
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array evey time,
- but this is fast enough for the moment. */
- rc = xc_shadow_control(
- xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
- p2m_size, NULL, 0, NULL);
- if ( rc != p2m_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_type[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < p2m_size);
- N++ )
- {
- int n = permute(N, p2m_size, order_nr);
-
- if ( debug )
- DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
- iter, (unsigned long)n, live_p2m[n],
- test_bit(n, to_send),
- mfn_to_pfn(live_p2m[n]&0xFFFFF));
-
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- ** 3. add in pages that still need fixup (net bufs)
- */
-
- pfn_batch[batch] = n;
- pfn_type[batch] = live_p2m[n];
-
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
- ** not currently in psuedo-physical map -- set bit
- ** in to_fix since we must send this page in last_iter
- ** unless its sent sooner anyhow, or it never enters
- ** pseudo-physical map (e.g. for ballooned down domains)
- */
- set_bit(n, to_fix);
- continue;
- }
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_READ, pfn_type, batch);
- if ( region_base == NULL )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- for ( j = 0; j < batch; j++ )
- ((uint32_t *)pfn_type)[j] = pfn_type[j];
- if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
- (uint32_t *)pfn_type) )
- {
- ERROR("get_pfn_type_batch failed");
- goto out;
- }
- for ( j = batch-1; j >= 0; j-- )
- pfn_type[j] = ((uint32_t *)pfn_type)[j];
-
- for ( j = 0; j < batch; j++ )
- {
-
- if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
- XEN_DOMCTL_PFINFO_XTAB )
- {
- DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
- continue;
- }
-
- if ( debug )
- DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
- " sum= %08lx\n",
- iter,
- (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
- pfn_batch[j],
- pfn_type[j],
- mfn_to_pfn(pfn_type[j] &
- ~XEN_DOMCTL_PFINFO_LTAB_MASK),
- csum_page(region_base + (PAGE_SIZE*j)));
-
- /* canonicalise mfn->pfn */
- pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
- pfn_batch[j];
- }
-
- if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (2) (errno %d)",
- errno);
- goto out;
- }
-
- if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) )
- {
- ERROR("Error when writing to state file (3) (errno %d)",
- errno);
- goto out;
- }
-
- /* entering this loop, pfn_type is now in pfns (Not mfns) */
- for ( j = 0; j < batch; j++ )
- {
- unsigned long pfn, pagetype;
- void *spage = (char *)region_base + (PAGE_SIZE*j);
-
- pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- /* write out pages in batch */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
- continue;
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /* We have a pagetable page: need to rewrite it. */
- race =
- canonicalize_pagetable(pagetype, pfn, spage, page);
-
- if ( race && !live )
- {
- ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
- pagetype);
- goto out;
- }
-
- if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
- {
- ERROR("Error when writing to state file (4)"
- " (errno %d)", errno);
- goto out;
- }
- }
- else
- {
- /* We have a normal page: just write it directly. */
- if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
- PAGE_SIZE )
- {
- ERROR("Error when writing to state file (5)"
- " (errno %d)", errno);
- goto out;
- }
- }
- } /* end of the write out for this batch */
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- total_sent += sent_this_iter;
-
- DPRINTF("\r %d: sent %d, skipped %d, ",
- iter, sent_this_iter, skip_this_iter );
-
- if ( last_iter )
- {
- print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
-
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/p2m_size );
- DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
- }
-
- if ( last_iter && debug )
- {
- int minusone = -1;
- memset(to_send, 0xff, BITMAP_SIZE);
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( !write_exact(io_fd, &minusone, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6) (errno %d)",
- errno);
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
- (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > p2m_size*max_factor) )
- {
- DPRINTF("Start last iteration\n");
- last_iter = 1;
-
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
- &ctxt) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
- info.shared_info_frame,
- (unsigned long)ctxt.user_regs.eip,
- (unsigned long)ctxt.user_regs.edx);
- }
-
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
- p2m_size, NULL, 0, &stats) != p2m_size )
- {
- ERROR("Error flushing shadow PT");
- goto out;
- }
-
- sent_last_iter = sent_this_iter;
-
- print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
-
- }
- } /* end of infinite for loop */
-
- DPRINTF("All memory is saved\n");
-
- {
- struct {
- int minustwo;
- int max_vcpu_id;
- uint64_t vcpumap;
- } chunk = { -2, info.max_vcpu_id };
-
- if ( info.max_vcpu_id >= 64 )
- {
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for ( i = 1; i <= info.max_vcpu_id; i++ )
- {
- xc_vcpuinfo_t vinfo;
- if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
- vinfo.online )
- vcpumap |= 1ULL << i;
- }
-
- chunk.vcpumap = vcpumap;
- if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
- {
- ERROR("Error when writing to state file (errno %d)", errno);
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( !write_exact(io_fd, &i, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6') (errno %d)", errno);
- goto out;
- }
-
- /* Send through a list of all the PFNs that were not in map at the close */
- {
- unsigned int i,j;
- unsigned long pfntab[1024];
-
- for ( i = 0, j = 0; i < p2m_size; i++ )
- {
- if ( !is_mapped(live_p2m[i]) )
- j++;
- }
-
- if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (6a) (errno %d)", errno);
- goto out;
- }
-
- for ( i = 0, j = 0; i < p2m_size; )
- {
- if ( !is_mapped(live_p2m[i]) )
- pfntab[j++] = i;
-
- i++;
- if ( (j == 1024) || (i == p2m_size) )
- {
- if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
- {
- ERROR("Error when writing to state file (6b) (errno %d)",
- errno);
- goto out;
- }
- j = 0;
- }
- }
- }
-
- /* Canonicalise the suspend-record frame number. */
- if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
- {
- ERROR("Suspend record is not in range of pseudophys map");
- goto out;
- }
-
- for ( i = 0; i <= info.max_vcpu_id; i++ )
- {
- if ( !(vcpumap & (1ULL << i)) )
- continue;
-
- if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
- {
- ERROR("No context for VCPU%d", i);
- goto out;
- }
-
- /* Canonicalise each GDT frame number. */
- for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
- {
- if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
- {
- ERROR("GDT frame is not in range of pseudophys map");
- goto out;
- }
- }
-
- /* Canonicalise the page table base pointer. */
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
- {
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- ctxt.ctrlreg[3] =
- xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
- {
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- /* Least-significant bit means 'valid PFN'. */
- ctxt.ctrlreg[1] = 1 |
- xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
- }
-
- if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
- {
- ERROR("Error when writing to state file (1) (errno %d)", errno);
- goto out;
- }
- }
-
- /*
- * Reset the MFN to be a known-invalid value. See map_frame_list_list().
- */
- memcpy(page, live_shinfo, PAGE_SIZE);
- ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
- if ( !write_exact(io_fd, page, PAGE_SIZE) )
- {
- ERROR("Error when writing to state file (1) (errno %d)", errno);
- goto out;
- }
-
- /* Success! */
- rc = 0;
-
- out:
-
- if ( live )
- {
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- }
-
- /* Flush last write and discard cache for file. */
- discard_file_cache(io_fd, 1 /* flush */);
-
- if ( live_shinfo )
- munmap(live_shinfo, PAGE_SIZE);
-
- if ( live_p2m_frame_list_list )
- munmap(live_p2m_frame_list_list, PAGE_SIZE);
-
- if ( live_p2m_frame_list )
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
-
- if ( live_p2m )
- munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
-
- if ( live_m2p )
- munmap(live_m2p, M2P_SIZE(max_mfn));
-
- free(pfn_type);
- free(pfn_batch);
- free(to_send);
- free(to_fix);
- free(to_skip);
-
- DPRINTF("Save exit rc=%d\n",rc);
-
- return !!rc;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/xenguest.h Wed Apr 11 14:45:14 2007 +0100
@@ -16,26 +16,19 @@
/**
- * This function will save a domain running Linux.
+ * This function will save a running domain.
*
* @parm xc_handle a handle to an open hypervisor interface
* @parm fd the file descriptor to save a domain to
* @parm dom the id of the domain
* @return 0 on success, -1 on failure
*/
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(int domid));
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+ uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+ int (*suspend)(int domid), int hvm,
+ void *(*init_qemu_maps)(int, unsigned), /* HVM only */
+ void (*qemu_flip_buffer)(int, int)); /* HVM only */
-/**
- * This function will save a hvm domain running unmodified guest.
- * @return 0 on success, -1 on failure
- */
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(int domid),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int));
/**
* This function will restore a saved domain.
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/xg_private.c Wed Apr 11 14:45:14 2007 +0100
@@ -193,17 +193,6 @@ __attribute__((weak))
uint32_t domid,
int memsize,
const char *image_name)
-{
- errno = ENOSYS;
- return -1;
-}
-
-__attribute__((weak))
- int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags,
- int (*suspend)(int domid),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int))
{
errno = ENOSYS;
return -1;
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/xcutils/xc_save.c Wed Apr 11 14:45:14 2007 +0100
@@ -174,12 +174,9 @@ main(int argc, char **argv)
max_f = atoi(argv[4]);
flags = atoi(argv[5]);
- if (flags & XCFLAGS_HVM)
- ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags,
- &suspend, &init_qemu_maps, &qemu_flip_buffer);
- else
- ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags,
- &suspend);
+ ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags,
+ &suspend, !!(flags & XCFLAGS_HVM),
+ &init_qemu_maps, &qemu_flip_buffer);
xc_interface_close(xc_fd);
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|