# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1176302729 -3600
# Node ID db4fcb6093832c24771764bd2cb5af9a2608bca2
# Parent 3d356a2b1c75c2fea9b8eb0643075614e9e3d4fe
# Parent 0d92cd901f809ce898c7c62008cf446a0b295c1c
Merge with xen-ia64-unstable.hg
---
tools/libxc/xc_hvm_save.c | 755 -----
tools/libxc/xc_linux_save.c | 1414 ----------
linux-2.6-xen-sparse/drivers/xen/blkfront/block.h | 14
linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c | 4
linux-2.6-xen-sparse/drivers/xen/core/gnttab.c | 28
linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c | 2
tools/blktap/drivers/block-qcow.c | 10
tools/ioemu/hw/pc.c | 12
tools/ioemu/vl.c | 2
tools/ioemu/vl.h | 5
tools/ioemu/xenstore.c | 214 -
tools/libfsimage/fat/fat.h | 14
tools/libxc/Makefile | 4
tools/libxc/ia64/xc_ia64_linux_save.c | 6
tools/libxc/xc_domain_save.c | 1609 ++++++++++++
tools/libxc/xenguest.h | 19
tools/libxc/xg_private.c | 11
tools/pygrub/src/LiloConf.py | 147 +
tools/pygrub/src/pygrub | 32
tools/python/xen/xend/XendCheckpoint.py | 7
tools/python/xen/xend/server/DevController.py | 1
tools/python/xen/xend/server/netif.py | 88
tools/xcutils/xc_save.c | 9
unmodified_drivers/linux-2.6/platform-pci/evtchn.c | 150 -
unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c | 73
unmodified_drivers/linux-2.6/platform-pci/platform-compat.c | 3
unmodified_drivers/linux-2.6/platform-pci/platform-pci.c | 34
unmodified_drivers/linux-2.6/platform-pci/platform-pci.h | 11
28 files changed, 2150 insertions(+), 2528 deletions(-)
diff -r 3d356a2b1c75 -r db4fcb609383
linux-2.6-xen-sparse/drivers/xen/blkfront/block.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Wed Apr 11 07:30:02
2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Wed Apr 11 15:45:29
2007 +0100
@@ -55,20 +55,6 @@
#include <asm/io.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
-
-#if 1
-#define IPRINTK(fmt, args...) \
- printk(KERN_INFO "xen_blk: " fmt, ##args)
-#else
-#define IPRINTK(fmt, args...) ((void)0)
-#endif
-
-#if 1
-#define WPRINTK(fmt, args...) \
- printk(KERN_WARNING "xen_blk: " fmt, ##args)
-#else
-#define WPRINTK(fmt, args...) ((void)0)
-#endif
#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
diff -r 3d356a2b1c75 -r db4fcb609383
linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Wed Apr 11 07:30:02
2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Wed Apr 11 15:45:29
2007 +0100
@@ -128,14 +128,12 @@ xlbd_alloc_major_info(int major, int min
break;
}
- printk("Registering block device major %i\n", ptr->major);
if (register_blkdev(ptr->major, ptr->type->devname)) {
- WPRINTK("can't get major %d with name %s\n",
- ptr->major, ptr->type->devname);
kfree(ptr);
return NULL;
}
+ printk("xen-vbd: registered block device major %i\n", ptr->major);
major_info[index] = ptr;
return ptr;
}
diff -r 3d356a2b1c75 -r db4fcb609383
linux-2.6-xen-sparse/drivers/xen/core/gnttab.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Wed Apr 11 07:30:02
2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Wed Apr 11 15:45:29
2007 +0100
@@ -60,9 +60,6 @@ static DEFINE_SPINLOCK(gnttab_list_lock)
static DEFINE_SPINLOCK(gnttab_list_lock);
static struct grant_entry *shared;
-#ifndef CONFIG_XEN
-static unsigned long resume_frames;
-#endif
static struct gnttab_free_callback *gnttab_free_callback_list;
@@ -514,6 +511,8 @@ int gnttab_suspend(void)
#include <platform-pci.h>
+static unsigned long resume_frames;
+
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct xen_add_to_physmap xatp;
@@ -543,23 +542,17 @@ int gnttab_resume(void)
if (max_nr_gframes < nr_gframes)
return -ENOSYS;
- resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ if (!resume_frames) {
+ resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
+ if (shared == NULL) {
+ printk("error to ioremap gnttab share frames\n");
+ return -1;
+ }
+ }
gnttab_map(0, nr_gframes - 1);
- shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
- if (shared == NULL) {
- printk("error to ioremap gnttab share frames\n");
- return -1;
- }
-
- return 0;
-}
-
-int gnttab_suspend(void)
-{
- iounmap(shared);
- resume_frames = 0;
return 0;
}
@@ -624,7 +617,6 @@ int __devinit gnttab_init(void)
gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
gnttab_free_head = NR_RESERVED_ENTRIES;
- printk("Grant table initialized\n");
return 0;
ini_nomem:
diff -r 3d356a2b1c75 -r db4fcb609383
linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c Wed Apr 11
07:30:02 2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c Wed Apr 11
15:45:29 2007 +0100
@@ -209,6 +209,8 @@ int __xen_suspend(int fast_suspend)
if (fast_suspend) {
xenbus_suspend();
err = stop_machine_run(take_machine_down, &fast_suspend, 0);
+ if (err < 0)
+ xenbus_suspend_cancel();
} else {
err = take_machine_down(&fast_suspend);
}
diff -r 3d356a2b1c75 -r db4fcb609383 tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/blktap/drivers/block-qcow.c Wed Apr 11 15:45:29 2007 +0100
@@ -949,8 +949,14 @@ int tdqcow_open (struct disk_driver *dd,
goto fail;
}
init_fds(dd);
- s->fd_end = (final_cluster == 0 ? (s->l1_table_offset + l1_table_size)
:
- (final_cluster + s->cluster_size));
+
+ if (!final_cluster)
+ s->fd_end = s->l1_table_offset + l1_table_size;
+ else {
+ s->fd_end = lseek64(fd, 0, SEEK_END);
+ if (s->fd_end == (off64_t)-1)
+ goto fail;
+ }
return 0;
diff -r 3d356a2b1c75 -r db4fcb609383 tools/ioemu/hw/pc.c
--- a/tools/ioemu/hw/pc.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/hw/pc.c Wed Apr 11 15:45:29 2007 +0100
@@ -902,7 +902,6 @@ static void pc_init1(uint64_t ram_size,
if (pci_enabled && acpi_enabled) {
piix4_pm_init(pci_bus, piix3_devfn + 3);
}
-#endif /* !CONFIG_DM */
#if 0
/* ??? Need to figure out some way for the user to
@@ -921,6 +920,17 @@ static void pc_init1(uint64_t ram_size,
lsi_scsi_attach(scsi, bdrv, -1);
}
#endif
+#else
+ if (pci_enabled) {
+ void *scsi;
+
+ scsi = lsi_scsi_init(pci_bus, -1);
+ for (i = 0; i < MAX_SCSI_DISKS ; i++) {
+ if (bs_table[i + MAX_DISKS])
+ lsi_scsi_attach(scsi, bs_table[i + MAX_DISKS], -1);
+ }
+ }
+#endif /* !CONFIG_DM */
/* must be done after all PCI devices are instanciated */
/* XXX: should be done in the Bochs BIOS */
if (pci_enabled) {
diff -r 3d356a2b1c75 -r db4fcb609383 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/vl.c Wed Apr 11 15:45:29 2007 +0100
@@ -116,7 +116,7 @@ void *ioport_opaque[MAX_IOPORTS];
void *ioport_opaque[MAX_IOPORTS];
IOPortReadFunc *ioport_read_table[3][MAX_IOPORTS];
IOPortWriteFunc *ioport_write_table[3][MAX_IOPORTS];
-BlockDriverState *bs_table[MAX_DISKS], *fd_table[MAX_FD];
+BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS], *fd_table[MAX_FD];
int vga_ram_size;
int bios_size;
static DisplayState display_state;
diff -r 3d356a2b1c75 -r db4fcb609383 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/vl.h Wed Apr 11 15:45:29 2007 +0100
@@ -818,8 +818,9 @@ int vnc_start_viewer(int port);
/* ide.c */
#define MAX_DISKS 4
-
-extern BlockDriverState *bs_table[MAX_DISKS];
+#define MAX_SCSI_DISKS 7
+
+extern BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS];
void isa_ide_init(int iobase, int iobase2, int irq,
BlockDriverState *hd0, BlockDriverState *hd1);
diff -r 3d356a2b1c75 -r db4fcb609383 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/xenstore.c Wed Apr 11 15:45:29 2007 +0100
@@ -30,11 +30,11 @@ static int pasprintf(char **buf, const c
int ret = 0;
if (*buf)
- free(*buf);
+ free(*buf);
va_start(ap, fmt);
if (vasprintf(buf, fmt, ap) == -1) {
- buf = NULL;
- ret = -1;
+ buf = NULL;
+ ret = -1;
}
va_end(ap);
return ret;
@@ -45,11 +45,11 @@ static void insert_media(void *opaque)
int i;
for (i = 0; i < MAX_DISKS; i++) {
- if (media_filename[i] && bs_table[i]) {
- do_change(bs_table[i]->device_name, media_filename[i]);
- free(media_filename[i]);
- media_filename[i] = NULL;
- }
+ if (media_filename[i] && bs_table[i]) {
+ do_change(bs_table[i]->device_name, media_filename[i]);
+ free(media_filename[i]);
+ media_filename[i] = NULL;
+ }
}
}
@@ -57,7 +57,7 @@ void xenstore_check_new_media_present(in
{
if (insert_timer == NULL)
- insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
+ insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
qemu_mod_timer(insert_timer, qemu_get_clock(rt_clock) + timeout);
}
@@ -82,8 +82,8 @@ void xenstore_parse_domain_config(int do
char **e = NULL;
char *buf = NULL, *path;
char *fpath = NULL, *bpath = NULL,
- *dev = NULL, *params = NULL, *type = NULL;
- int i;
+ *dev = NULL, *params = NULL, *type = NULL;
+ int i, is_scsi;
unsigned int len, num, hd_index;
for(i = 0; i < MAX_DISKS; i++)
@@ -91,8 +91,8 @@ void xenstore_parse_domain_config(int do
xsh = xs_daemon_open();
if (xsh == NULL) {
- fprintf(logfile, "Could not contact xenstore for domain config\n");
- return;
+ fprintf(logfile, "Could not contact xenstore for domain config\n");
+ return;
}
path = xs_get_domain_path(xsh, domid);
@@ -102,59 +102,60 @@ void xenstore_parse_domain_config(int do
}
if (pasprintf(&buf, "%s/device/vbd", path) == -1)
- goto out;
+ goto out;
e = xs_directory(xsh, XBT_NULL, buf, &num);
if (e == NULL)
- goto out;
+ goto out;
for (i = 0; i < num; i++) {
- /* read the backend path */
- if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
- continue;
- free(bpath);
+ /* read the backend path */
+ if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
+ continue;
+ free(bpath);
bpath = xs_read(xsh, XBT_NULL, buf, &len);
- if (bpath == NULL)
- continue;
- /* read the name of the device */
- if (pasprintf(&buf, "%s/dev", bpath) == -1)
- continue;
- free(dev);
- dev = xs_read(xsh, XBT_NULL, buf, &len);
- if (dev == NULL)
- continue;
- if (strncmp(dev, "hd", 2) || strlen(dev) != 3)
- continue;
- hd_index = dev[2] - 'a';
- if (hd_index >= MAX_DISKS)
- continue;
- /* read the type of the device */
- if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
- continue;
- free(type);
- type = xs_read(xsh, XBT_NULL, buf, &len);
- if (pasprintf(&buf, "%s/params", bpath) == -1)
- continue;
- free(params);
- params = xs_read(xsh, XBT_NULL, buf, &len);
- if (params == NULL)
- continue;
+ if (bpath == NULL)
+ continue;
+ /* read the name of the device */
+ if (pasprintf(&buf, "%s/dev", bpath) == -1)
+ continue;
+ free(dev);
+ dev = xs_read(xsh, XBT_NULL, buf, &len);
+ if (dev == NULL)
+ continue;
+ is_scsi = !strncmp(dev, "sd", 2);
+ if ((strncmp(dev, "hd", 2) && !is_scsi) || strlen(dev) != 3 )
+ continue;
+ hd_index = dev[2] - 'a';
+ if (hd_index >= (is_scsi ? MAX_SCSI_DISKS : MAX_DISKS))
+ continue;
+ /* read the type of the device */
+ if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
+ continue;
+ free(type);
+ type = xs_read(xsh, XBT_NULL, buf, &len);
+ if (pasprintf(&buf, "%s/params", bpath) == -1)
+ continue;
+ free(params);
+ params = xs_read(xsh, XBT_NULL, buf, &len);
+ if (params == NULL)
+ continue;
/*
* check if device has a phantom vbd; the phantom is hooked
* to the frontend device (for ease of cleanup), so lookup
* the frontend device, and see if there is a phantom_vbd
* if there is, we will use resolution as the filename
*/
- if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
- continue;
- free(fpath);
+ if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
+ continue;
+ free(fpath);
fpath = xs_read(xsh, XBT_NULL, buf, &len);
- if (fpath) {
- if (pasprintf(&buf, "%s/dev", fpath) == -1)
- continue;
- free(params);
+ if (fpath) {
+ if (pasprintf(&buf, "%s/dev", fpath) == -1)
+ continue;
+ free(params);
params = xs_read(xsh, XBT_NULL, buf , &len);
- if (params) {
+ if (params) {
/*
* wait for device, on timeout silently fail because we will
* fail to open below
@@ -163,19 +164,20 @@ void xenstore_parse_domain_config(int do
}
}
- bs_table[hd_index] = bdrv_new(dev);
- /* check if it is a cdrom */
- if (type && !strcmp(type, "cdrom")) {
- bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
- if (pasprintf(&buf, "%s/params", bpath) != -1)
- xs_watch(xsh, buf, dev);
- }
- /* open device now if media present */
- if (params[0]) {
- if (bdrv_open(bs_table[hd_index], params, 0 /* snapshot */) < 0)
+ bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)] = bdrv_new(dev);
+ /* check if it is a cdrom */
+ if (type && !strcmp(type, "cdrom")) {
+ bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
+ if (pasprintf(&buf, "%s/params", bpath) != -1)
+ xs_watch(xsh, buf, dev);
+ }
+ /* open device now if media present */
+ if (params[0]) {
+ if (bdrv_open(bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)],
+ params, 0 /* snapshot */) < 0)
fprintf(stderr, "qemu: could not open hard disk image '%s'\n",
params);
- }
+ }
}
/* Set a watch for log-dirty requests from the migration tools */
@@ -199,7 +201,7 @@ int xenstore_fd(void)
int xenstore_fd(void)
{
if (xsh)
- return xs_fileno(xsh);
+ return xs_fileno(xsh);
return -1;
}
@@ -316,7 +318,7 @@ void xenstore_process_event(void *opaque
vec = xs_read_watch(xsh, &num);
if (!vec)
- return;
+ return;
if (!strcmp(vec[XS_WATCH_TOKEN], "logdirty")) {
xenstore_process_logdirty_event();
@@ -324,23 +326,23 @@ void xenstore_process_event(void *opaque
}
if (strncmp(vec[XS_WATCH_TOKEN], "hd", 2) ||
- strlen(vec[XS_WATCH_TOKEN]) != 3)
- goto out;
+ strlen(vec[XS_WATCH_TOKEN]) != 3)
+ goto out;
hd_index = vec[XS_WATCH_TOKEN][2] - 'a';
image = xs_read(xsh, XBT_NULL, vec[XS_WATCH_PATH], &len);
if (image == NULL || !strcmp(image, bs_table[hd_index]->filename))
- goto out; /* gone or identical */
+ goto out; /* gone or identical */
do_eject(0, vec[XS_WATCH_TOKEN]);
bs_table[hd_index]->filename[0] = 0;
if (media_filename[hd_index]) {
- free(media_filename[hd_index]);
- media_filename[hd_index] = NULL;
+ free(media_filename[hd_index]);
+ media_filename[hd_index] = NULL;
}
if (image[0]) {
- media_filename[hd_index] = strdup(image);
- xenstore_check_new_media_present(5000);
+ media_filename[hd_index] = strdup(image);
+ xenstore_check_new_media_present(5000);
}
out:
@@ -354,7 +356,7 @@ void xenstore_write_vncport(int display)
char *portstr = NULL;
if (xsh == NULL)
- return;
+ return;
path = xs_get_domain_path(xsh, domid);
if (path == NULL) {
@@ -363,10 +365,10 @@ void xenstore_write_vncport(int display)
}
if (pasprintf(&buf, "%s/console/vnc-port", path) == -1)
- goto out;
+ goto out;
if (pasprintf(&portstr, "%d", 5900 + display) == -1)
- goto out;
+ goto out;
if (xs_write(xsh, XBT_NULL, buf, portstr, strlen(portstr)) == 0)
fprintf(logfile, "xs_write() vncport failed\n");
@@ -383,41 +385,41 @@ int xenstore_read_vncpasswd(int domid)
unsigned int i, len, rc = 0;
if (xsh == NULL) {
- return -1;
+ return -1;
}
path = xs_get_domain_path(xsh, domid);
if (path == NULL) {
- fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
- return -1;
+ fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
+ return -1;
}
pasprintf(&buf, "%s/vm", path);
uuid = xs_read(xsh, XBT_NULL, buf, &len);
if (uuid == NULL) {
- fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
- free(path);
- return -1;
+ fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
+ free(path);
+ return -1;
}
pasprintf(&buf, "%s/vncpasswd", uuid);
passwd = xs_read(xsh, XBT_NULL, buf, &len);
if (passwd == NULL) {
- fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
- free(uuid);
- free(path);
- return rc;
+ fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
+ free(uuid);
+ free(path);
+ return rc;
}
for (i=0; i<len && i<63; i++) {
- vncpasswd[i] = passwd[i];
- passwd[i] = '\0';
+ vncpasswd[i] = passwd[i];
+ passwd[i] = '\0';
}
vncpasswd[len] = '\0';
pasprintf(&buf, "%s/vncpasswd", uuid);
if (xs_write(xsh, XBT_NULL, buf, passwd, len) == 0) {
- fprintf(logfile, "xs_write() vncpasswd failed.\n");
- rc = -1;
+ fprintf(logfile, "xs_write() vncpasswd failed.\n");
+ rc = -1;
}
free(passwd);
@@ -443,7 +445,7 @@ char **xenstore_domain_get_devices(struc
goto out;
if (pasprintf(&buf, "%s/device/%s", path,devtype) == -1)
- goto out;
+ goto out;
e = xs_directory(handle, XBT_NULL, buf, num);
@@ -496,13 +498,13 @@ char *xenstore_backend_read_variable(str
buf = get_device_variable_path(devtype, inst, var);
if (NULL == buf)
- goto out;
+ goto out;
value = xs_read(handle, XBT_NULL, buf, &len);
free(buf);
-out:
+ out:
return value;
}
@@ -569,27 +571,27 @@ char *xenstore_vm_read(int domid, char *
char *buf = NULL, *path = NULL, *value = NULL;
if (xsh == NULL)
- goto out;
+ goto out;
path = xs_get_domain_path(xsh, domid);
if (path == NULL) {
- fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
- goto out;
+ fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
+ goto out;
}
pasprintf(&buf, "%s/vm", path);
free(path);
path = xs_read(xsh, XBT_NULL, buf, NULL);
if (path == NULL) {
- fprintf(logfile, "xs_read(%s): read error\n", buf);
- goto out;
+ fprintf(logfile, "xs_read(%s): read error\n", buf);
+ goto out;
}
pasprintf(&buf, "%s/%s", path, key);
value = xs_read(xsh, XBT_NULL, buf, len);
if (value == NULL) {
- fprintf(logfile, "xs_read(%s): read error\n", buf);
- goto out;
+ fprintf(logfile, "xs_read(%s): read error\n", buf);
+ goto out;
}
out:
@@ -604,27 +606,27 @@ int xenstore_vm_write(int domid, char *k
int rc = -1;
if (xsh == NULL)
- goto out;
+ goto out;
path = xs_get_domain_path(xsh, domid);
if (path == NULL) {
- fprintf(logfile, "xs_get_domain_path: error\n");
- goto out;
+ fprintf(logfile, "xs_get_domain_path: error\n");
+ goto out;
}
pasprintf(&buf, "%s/vm", path);
free(path);
path = xs_read(xsh, XBT_NULL, buf, NULL);
if (path == NULL) {
- fprintf(logfile, "xs_read(%s): read error\n", buf);
- goto out;
+ fprintf(logfile, "xs_read(%s): read error\n", buf);
+ goto out;
}
pasprintf(&buf, "%s/%s", path, key);
rc = xs_write(xsh, XBT_NULL, buf, value, strlen(value));
if (rc) {
- fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
- goto out;
+ fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
+ goto out;
}
out:
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libfsimage/fat/fat.h
--- a/tools/libfsimage/fat/fat.h Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libfsimage/fat/fat.h Wed Apr 11 15:45:29 2007 +0100
@@ -84,17 +84,17 @@ struct fat_bpb {
#define FAT_DIRENTRY_LENGTH 32
#define FAT_DIRENTRY_ATTRIB(entry) \
- (*((unsigned char *) (entry+11)))
+ (*((__u8 *) (entry+11)))
#define FAT_DIRENTRY_VALID(entry) \
- ( ((*((unsigned char *) entry)) != 0) \
- && ((*((unsigned char *) entry)) != 0xE5) \
+ ( ((*((__u8 *) entry)) != 0) \
+ && ((*((__u8 *) entry)) != 0xE5) \
&& !(FAT_DIRENTRY_ATTRIB(entry) & FAT_ATTRIB_NOT_OK_MASK) )
#define FAT_DIRENTRY_FIRST_CLUSTER(entry) \
- ((*((unsigned short *) (entry+26)))+(*((unsigned short *) (entry+20)) << 16))
+ ((*((__u16 *) (entry+26)))+(*((__u16 *) (entry+20)) << 16))
#define FAT_DIRENTRY_FILELENGTH(entry) \
- (*((unsigned long *) (entry+28)))
+ (*((__u32 *) (entry+28)))
#define FAT_LONGDIR_ID(entry) \
- (*((unsigned char *) (entry)))
+ (*((__u8 *) (entry)))
#define FAT_LONGDIR_ALIASCHECKSUM(entry) \
- (*((unsigned char *) (entry+13)))
+ (*((__u8 *) (entry+13)))
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/Makefile
--- a/tools/libxc/Makefile Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/Makefile Wed Apr 11 15:45:29 2007 +0100
@@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
GUEST_SRCS-y :=
GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
# symlink libelf from xen/common/libelf/
LIBELF_SRCS := libelf-tools.c libelf-loader.c
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c Wed Apr 11 15:45:29 2007 +0100
@@ -134,8 +134,10 @@ retry:
}
int
-xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+ uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+ int hvm, void *(*init_qemu_maps)(int, unsigned),
+ void (*qemu_flip_buffer)(int, int))
{
DECLARE_DOMCTL;
xc_dominfo_t info;
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/xc_domain_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_domain_save.c Wed Apr 11 15:45:29 2007 +0100
@@ -0,0 +1,1609 @@
+/******************************************************************************
+ * xc_domain_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_domain_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
+#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static xen_pfn_t *live_p2m = NULL;
+
+/* Live mapping of system MFN to PFN table. */
+static xen_pfn_t *live_m2p = NULL;
+static unsigned long m2p_mfn0;
+
+/* grep fodder: machine_to_phys */
+
+#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
+
+/*
+ * Returns TRUE if the given machine frame number has a unique mapping
+ * in the guest's pseudophysical map.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
+ (((_mfn) < (max_mfn)) && \
+ ((mfn_to_pfn(_mfn) < (p2m_size)) && \
+ (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
+
+/* Returns TRUE if MFN is successfully converted to a PFN. */
+#define translate_mfn_to_pfn(_pmfn) \
+({ \
+ unsigned long mfn = *(_pmfn); \
+ int _res = 1; \
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
+ _res = 0; \
+ else \
+ *(_pmfn) = mfn_to_pfn(mfn); \
+ _res; \
+})
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+ ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+/*
+ * Return bit 'nr' (0 or 1) of the bitmap at 'addr'. The bitmap is accessed
+ * as an array of unsigned long words holding BITS_PER_LONG bits each.
+ */
+static inline int test_bit (int nr, volatile void * addr)
+{
+ return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+/*
+ * Clear bit 'nr' of the bitmap at 'addr', using an ordinary compound
+ * assignment on the containing unsigned long word.
+ */
+static inline void clear_bit (int nr, volatile void * addr)
+{
+ BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+/*
+ * Set bit 'nr' of the bitmap at 'addr', using an ordinary compound
+ * assignment on the containing unsigned long word.
+ */
+static inline void set_bit ( int nr, volatile void * addr)
+{
+ BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Returns the hamming weight (i.e. the number of bits set) of a 32-bit word.
+ * Neighbouring bit fields are added in parallel, the field width doubling at
+ * every step: single bits, then 2-, 4-, 8- and finally 16-bit fields.
+ */
+static inline unsigned int hweight32(unsigned int w)
+{
+ unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+ res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+ res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+ res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+ return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+}
+
+/*
+ * Count the bits set in the first 'nr' bits of the bitmap at 'addr'.
+ *
+ * Only whole unsigned long words are examined: nr is divided by the word
+ * size in bits and any remainder is ignored. Each word is counted as two
+ * 32-bit halves so that the upper half is not silently dropped on targets
+ * where unsigned long is 64 bits wide.
+ */
+static inline int count_bits ( int nr, volatile void *addr)
+{
+    int i, count = 0;
+    volatile unsigned long *p = (volatile unsigned long *)addr;
+    /* We know that the array is padded to unsigned long. */
+    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
+    {
+        /* Read the (volatile) word once, then count both halves of it. */
+        unsigned long w = *p;
+
+        count += hweight32((unsigned int)w);
+        if ( sizeof(w) > sizeof(uint32_t) )
+            count += hweight32((unsigned int)((uint64_t)w >> 32));
+    }
+    return count;
+}
+/*
+ * Map index i to a scattered index below nr: the low order_nr bits of i are
+ * rotated left by 10 places, and the step is repeated while the result is
+ * still >= nr.
+ *
+ * NOTE(review): the right-shift count is (order_nr - 10), which is negative
+ * (undefined in C) when order_nr < 10 - presumably callers never pass such
+ * a value; confirm.
+ */
+static inline int permute( int i, int nr, int order_nr )
+{
+ /* Need a simple permutation function so that we scan pages in a
+ pseudo random order, enabling us to get a better estimate of
+ the domain's page dirtying rate as we go (there are often
+ contiguous ranges of pfns that have similar behaviour, and we
+ want to mix them up). */
+
+ /* e.g. nr->order 15->4 16->4 17->5 */
+ /* 512MB domain, 128k pages, order 17 */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ QPONMLKJIH
+ GFEDCBA
+ */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ EDCBA
+ QPONM
+ LKJIHGF
+ */
+
+ do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+ while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+ return i;
+}
+
+/*
+ * Convert a timeval to a count of microseconds.
+ *
+ * tv_sec is widened to 64 bits before it is scaled: where time_t is only
+ * 32 bits wide, tv_sec * 1000000 would overflow for any present-day date.
+ */
+static uint64_t tv_to_us(struct timeval *new)
+{
+    return ((uint64_t)new->tv_sec * 1000000) + new->tv_usec;
+}
+/* Return the current gettimeofday() time as microseconds (via tv_to_us()). */
+static uint64_t llgettimeofday(void)
+{
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ return tv_to_us(&now);
+}
+
+/*
+ * Return the interval (new - old) in microseconds.
+ *
+ * The seconds difference is computed in 64 bits before scaling, so that a
+ * long interval (or an 'old' stamp that is still all-zero) does not overflow
+ * where time_t is 32 bits wide. The result is returned as an unsigned
+ * value; callers are expected to pass new >= old.
+ */
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return (((int64_t)new->tv_sec - old->tv_sec) * 1000000) +
+        (new->tv_usec - old->tv_usec);
+}
+/*
+ * Write len bytes from buffer to fd with write() and return its result
+ * unchanged, so callers see short writes and -1 as they are.
+ *
+ * A static counter accumulates len across calls (whether or not the write
+ * succeeded); once it reaches MAX_PAGECACHE_USAGE * PAGE_SIZE,
+ * discard_file_cache() is called on fd and the counter starts again.
+ * The 'live' argument is not used in this function.
+ */
+static int noncached_write(int fd, int live, void *buffer, int len)
+{
+ static int write_count = 0;
+
+ int rc = write(fd,buffer,len);
+
+ write_count += len;
+ if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+ {
+ /* Time to discard cache - dont care if this fails */
+ discard_file_cache(fd, 0 /* no flush */);
+ write_count = 0;
+ }
+
+ return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
+#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU 781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+/* Reset the current transmit rate (mbit_rate) to START_MBIT_RATE. */
+static inline void initialize_mbit_rate()
+{
+ mbit_rate = START_MBIT_RATE;
+}
+
+/*
+ * Rate-limited front end to noncached_write() (ADAPTIVE_SAVE builds only).
+ *
+ * A static byte budget is charged n on every call. While the budget is
+ * negative, one BURST_BUDGET is credited for every burst_time_us that has
+ * elapsed since last_put (burst_time_us = RATE_TO_BTU / mbit_rate, refreshed
+ * whenever mbit_rate changes); if that is still not enough, the caller
+ * sleeps for the remainder of the current slot and tries again. The very
+ * first overdraft is simply granted one burst and starts the clock.
+ *
+ * Returns whatever noncached_write() returns for the n bytes at buf.
+ */
+static int ratewrite(int io_fd, int live, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, live, buf, n);
+
+    budget -= n;
+    if ( budget < 0 )
+    {
+        if ( mbit_rate != ombit_rate )
+        {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    /* Carry whole seconds so tv_usec stays below 1000000. */
+                    if ( last_put.tv_usec >= 1000000 )
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                /* nanosleep() rewrites 'delay' with the time left if interrupted. */
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, live, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+/* No rate limiting without ADAPTIVE_SAVE: ratewrite() is noncached_write(). */
+#define ratewrite(_io_fd, _live, _buf, _n) \
+    noncached_write((_io_fd), (_live), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+/*
+ * Write exactly 'count' bytes from 'buf' to 'fd'.
+ *
+ * Returns 1 once every byte has been written and 0 on failure, so callers
+ * can keep testing the result as a boolean. A short write is continued
+ * from where it stopped and a write interrupted by a signal (EINTR) is
+ * retried, instead of either being reported as a failure.
+ */
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    const char *p = buf;
+
+    while ( count > 0 )
+    {
+        ssize_t rc = write(fd, p, count);
+
+        if ( rc < 0 )
+        {
+            if ( errno == EINTR )
+                continue;
+            return 0;
+        }
+        /* No progress and no error: give up rather than spin forever. */
+        if ( rc == 0 )
+            return 0;
+
+        p += rc;
+        count -= (size_t)rc;
+    }
+
+    return 1;
+}
+
+/*
+ * Sample wall-clock time and the CPU usage of domain 0 and of 'domid', and
+ * (when 'print' is non-zero) log the CPU share of each, the send rate
+ * derived from pages_sent and the dirtying rate derived from
+ * stats->dirty_count, all relative to the previous call.
+ *
+ * The previous sample lives in static variables that start out as zero, so
+ * the figures produced by the first call are not meaningful. In
+ * ADAPTIVE_SAVE builds a dirtying rate above mbit_rate also raises
+ * mbit_rate (to that rate plus 50, capped at MAX_MBIT_RATE).
+ *
+ * Always returns 0.
+ */
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long d0_cpu_last;
+    static long long d1_cpu_last;
+
+    struct timeval wall_now;
+    long long wall_delta;
+    long long d0_cpu_now, d0_cpu_delta;
+    long long d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0);
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0);
+
+    /*
+     * Look for the -1 failure value before scaling: -1/1000 truncates to 0,
+     * so a test made on the scaled value could never detect it.
+     */
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    d0_cpu_now /= 1000;
+    d1_cpu_now /= 1000;
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1; /* avoid dividing by zero below */
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last = wall_now;
+
+    return 0;
+}
+
+/*
+ * Diagnostic loop: for each of 'runs' rounds, issue a shadow CLEAN
+ * operation (passing arr and p2m_size), then sample the shadow PEEK
+ * statistics 40 times at 50ms intervals, logging the elapsed time since the
+ * start together with the fault and dirty counts.
+ *
+ * The p2m_size parameter shadows the file-scope variable of the same name.
+ * Always returns -1.
+ */
+static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
+ unsigned long *arr, int runs)
+{
+ long long start, now;
+ xc_shadow_op_stats_t stats;
+ int j;
+
+ start = llgettimeofday();
+
+ for ( j = 0; j < runs; j++ )
+ {
+ int i;
+
+ xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ arr, p2m_size, NULL, 0, NULL);
+ DPRINTF("#Flush\n");
+ for ( i = 0; i < 40; i++ )
+ {
+ usleep(50000);
+ now = llgettimeofday();
+ xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+ NULL, 0, NULL, 0, &stats);
+ DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+ ((now-start)+500)/1000,
+ stats.fault_count, stats.dirty_count);
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Ask the domain to suspend through the 'suspend' callback, then poll its
+ * state until it reports shutdown with reason SHUTDOWN_suspend.
+ *
+ * Returns 0 once the domain is suspended; *info (and *ctxt, for vcpu 0)
+ * hold the values fetched on the last poll. Returns -1 if the callback
+ * returns zero, the domain info cannot be read, the domain is dying or
+ * crashed, it shut down for poweroff/reboot/crash, or it is still not
+ * suspended when the retry counter runs out (10ms between polls).
+ * io_fd is not used in this function.
+ *
+ * NOTE(review): the "paused" path unpauses and retries without advancing
+ * the retry counter - confirm that it cannot loop indefinitely.
+ */
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+ int dom, xc_dominfo_t *info,
+ vcpu_guest_context_t *ctxt)
+{
+ int i = 0;
+
+ if ( !(*suspend)(dom) )
+ {
+ ERROR("Suspend request failed");
+ return -1;
+ }
+
+ retry:
+
+ if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
+ {
+ ERROR("Could not get domain info");
+ return -1;
+ }
+
+ if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
+ ERROR("Could not get vcpu context"); /* logged only; polling continues */
+
+
+ if ( info->dying )
+ {
+ ERROR("domain is dying");
+ return -1;
+ }
+
+ if ( info->crashed )
+ {
+ ERROR("domain has crashed");
+ return -1;
+ }
+
+ if ( info->shutdown )
+ {
+ switch ( info->shutdown_reason )
+ {
+ case SHUTDOWN_poweroff:
+ case SHUTDOWN_reboot:
+ ERROR("domain has shut down");
+ return -1;
+ case SHUTDOWN_suspend:
+ return 0;
+ case SHUTDOWN_crash:
+ ERROR("domain has crashed");
+ return -1;
+ }
+ }
+
+ if ( info->paused )
+ {
+ /* Try unpausing domain, wait, and retest. */
+ xc_domain_unpause( xc_handle, dom );
+ ERROR("Domain was paused. Wait and re-test.");
+ usleep(10000); /* 10ms */
+ goto retry;
+ }
+
+ if ( ++i < 100 )
+ {
+ ERROR("Retry suspend domain");
+ usleep(10000); /* 10ms */
+ goto retry;
+ }
+
+ ERROR("Unable to suspend domain.");
+
+ return -1;
+}
+
+/*
+** Map the top-level page of MFNs from the guest. The guest might not have
+** finished resuming from a previous restore operation, so we wait a while for
+** it to update the MFN to a reasonable value.
+*/
+/*
+ * Wait (up to 100 sleeps of 10000us) for the guest to publish a non-zero
+ * pfn_to_mfn_frame_list_list in its shared info page, then map that frame
+ * read-only.  Returns the mapping, or NULL on timeout or mapping failure.
+ */
+static void *map_frame_list_list(int xc_handle, uint32_t dom,
+                                 shared_info_t *shinfo)
+{
+    void *mapping;
+    int tries;
+
+    for ( tries = 0;
+          (tries < 100) && (shinfo->arch.pfn_to_mfn_frame_list_list == 0);
+          tries++ )
+        usleep(10000);
+
+    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
+    {
+        ERROR("Timed out waiting for frame list updated.");
+        return NULL;
+    }
+
+    mapping = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
+                                   shinfo->arch.pfn_to_mfn_frame_list_list);
+    if ( mapping != NULL )
+        return mapping;
+
+    ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
+    return NULL;
+}
+
+/*
+** During transfer (or in the state file), all page-table pages must be
+** converted into a 'canonical' form where references to actual mfns
+** are replaced with references to the corresponding pfns.
+**
+** This function performs the appropriate conversion, taking into account
+** which entries do not require canonicalization (in particular, those
+** entries which map the virtual address reserved for the hypervisor).
+*/
+/*
+ * Copy the page-table page at 'spage' into 'dpage', replacing the machine
+ * frame number in every present entry with the corresponding pfn and
+ * zeroing entries that fall in the range reserved for the hypervisor.
+ *
+ * 'type' is the page-table level (XEN_DOMCTL_PFINFO_L*TAB).  The 'pfn'
+ * parameter is not used by the conversion.
+ *
+ * Returns non-zero if any present entry referenced an MFN outside the
+ * pseudo-physical map (that entry is written with frame 0), else 0.
+ */
+static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
+                                  const void *spage, void *dpage)
+{
+    /* 2-level guests use 4-byte entries; all others use 8-byte entries. */
+    const int narrow = (pt_levels == 2);
+    const int nr_ptes = PAGE_SIZE / (narrow ? 4 : 8);
+
+    /* [xen_first, xen_limit) is the hypervisor range; empty by default. */
+    int xen_first = nr_ptes;
+    int xen_limit = nr_ptes;
+
+    int raced = 0;
+    int idx;
+
+    switch ( pt_levels )
+    {
+    case 2:
+        if ( type == XEN_DOMCTL_PFINFO_L2TAB )
+            xen_first = (hvirt_start >> L2_PAGETABLE_SHIFT);
+        break;
+
+    case 3:
+        if ( type == XEN_DOMCTL_PFINFO_L3TAB )
+            xen_first = L3_PAGETABLE_ENTRIES_PAE;
+
+        /*
+         * In PAE only the L2 mapping the top 1GB contains Xen mappings.
+         * It is recognised by looking for the entry that points at the
+         * first M2P frame (m2p_mfn0).
+         */
+        if ( type == XEN_DOMCTL_PFINFO_L2TAB )
+        {
+            const uint64_t *l2 = (const uint64_t *) spage;
+            int slot = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            uint64_t ent = l2[slot];
+
+            if ( ((ent >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+            {
+                /* hvirt starts with xen stuff... */
+                xen_first = slot;
+            }
+            else if ( hvirt_start != 0xf5800000 )
+            {
+                /* old L2s from before hole was shrunk... */
+                slot = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+                ent = l2[slot];
+                if ( ((ent >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                    xen_first = slot;
+            }
+        }
+        break;
+
+    case 4:
+        if ( type == XEN_DOMCTL_PFINFO_L4TAB )
+        {
+            /*
+             * XXX SMH: should compute these from hvirt_start (which we
+             * have) and hvirt_end (which we don't)
+             */
+            xen_first = 256;
+            xen_limit = 272;
+        }
+        break;
+    }
+
+    /* Walk every entry, canonicalizing as we copy. */
+    for ( idx = 0; idx < nr_ptes; idx++ )
+    {
+        uint64_t pte;
+
+        if ( narrow )
+            pte = ((const uint32_t *)spage)[idx];
+        else
+            pte = ((const uint64_t *)spage)[idx];
+
+        if ( (idx >= xen_first) && (idx < xen_limit) )
+            pte = 0;
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            unsigned long mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            unsigned long gpfn;
+
+            if ( MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+            {
+                gpfn = mfn_to_pfn(mfn);
+            }
+            else
+            {
+                /* This will happen if the type info is stale which
+                   is quite feasible under live migration */
+                gpfn = 0;  /* zap it - we'll retransmit this page later */
+                raced = 1; /* inform the caller of race; fatal if !live */
+            }
+
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)gpfn << PAGE_SHIFT;
+
+            /*
+             * PAE guest L3Es can contain these flags when running on
+             * a 64bit hypervisor. We zap these here to avoid any
+             * surprise at restore time...
+             */
+            if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+        }
+
+        if ( narrow )
+            ((uint32_t *)dpage)[idx] = pte;
+        else
+            ((uint64_t *)dpage)[idx] = pte;
+    }
+
+    return raced;
+}
+
+/*
+ * Map the machine-to-physical table covering MFNs up to 'max_mfn' into
+ * our address space with protection 'prot'.
+ *
+ * The list of M2P frames is obtained with XENMEM_machphys_mfn_list, a
+ * region of M2P_SIZE(max_mfn) bytes is reserved with mmap() on the
+ * privcmd handle, and each M2P chunk is then mapped into that region.
+ * On success the file-scope m2p_mfn0 is set to the first M2P frame.
+ *
+ * Returns the mapping, or NULL on failure.  All temporary allocations are
+ * released on every path, and the reserved region is unmapped on failure.
+ */
+static xen_pfn_t *xc_map_m2p(int xc_handle,
+                             unsigned long max_mfn,
+                             int prot)
+{
+    struct xen_machphys_mfn_list xmml;
+    privcmd_mmap_entry_t *entries = NULL;
+    unsigned long m2p_chunks, m2p_size, i;
+    xen_pfn_t *m2p = MAP_FAILED;
+    xen_pfn_t *extent_start = NULL;
+    xen_pfn_t *ret = NULL;
+    int rc;
+
+    m2p_size   = M2P_SIZE(max_mfn);
+    m2p_chunks = M2P_CHUNKS(max_mfn);
+
+    xmml.max_extents = m2p_chunks;
+    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
+    {
+        ERROR("failed to allocate space for m2p mfns");
+        goto out;
+    }
+    set_xen_guest_handle(xmml.extent_start, extent_start);
+
+    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
+         (xmml.nr_extents != m2p_chunks) )
+    {
+        ERROR("xc_get_m2p_mfns");
+        goto out;
+    }
+
+    if ( (m2p = mmap(NULL, m2p_size, prot,
+                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
+    {
+        ERROR("failed to mmap m2p");
+        goto out;
+    }
+
+    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
+    {
+        ERROR("failed to allocate space for mmap entries");
+        goto out;
+    }
+
+    /* One mapping request per M2P chunk, laid out back to back. */
+    for ( i = 0; i < m2p_chunks; i++ )
+    {
+        entries[i].va = (unsigned long)(((char *)m2p) + (i * M2P_CHUNK_SIZE));
+        entries[i].mfn = extent_start[i];
+        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
+    }
+
+    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
+                                     entries, m2p_chunks)) < 0 )
+    {
+        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
+        goto out;
+    }
+
+    m2p_mfn0 = entries[0].mfn;
+    ret = m2p;
+
+ out:
+    /* Only keep the reserved region if it is being handed to the caller. */
+    if ( (ret == NULL) && (m2p != MAP_FAILED) )
+        munmap(m2p, m2p_size);
+
+    free(entries);
+    free(extent_start);
+
+    return ret;
+}
+
+
+/*
+ * Map the guest's live pfn-to-mfn table and write its (canonicalised)
+ * frame list to 'io_fd'.
+ *
+ * The table is reached through two levels of indirection from the shared
+ * info page: the frame-list-list page, then the frame-list pages, then the
+ * table itself.  For a 3-level guest with VMASST_TYPE_pae_extended_cr3
+ * set, an extended-info record carrying '*ctxt' is written before the
+ * frame list.
+ *
+ * Side effect: the file-scope live_p2m is pointed at the mapping as soon
+ * as it exists, because the translation macros rely on it.
+ *
+ * Returns the mapping of the live table, or NULL on failure (in which
+ * case the table mapping is torn down again).  The intermediate mappings
+ * and the local frame-list copy are always released.
+ */
+static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
+                                         int io_fd,
+                                         uint32_t dom,
+                                         vcpu_guest_context_t *ctxt,
+                                         unsigned long p2m_size,
+                                         shared_info_t *live_shinfo)
+{
+    xen_pfn_t *fll_map = NULL;  /* live frame-list-list page */
+    xen_pfn_t *fl_map = NULL;   /* live frame-list pages */
+    xen_pfn_t *fl_copy = NULL;  /* private copy of the frame list */
+    xen_pfn_t *p2m = NULL;      /* live p2m table */
+    xen_pfn_t *result = NULL;   /* what we hand back to the caller */
+    int pfn;
+
+    fll_map = map_frame_list_list(xc_handle, dom, live_shinfo);
+    if ( fll_map == NULL )
+        goto out;
+
+    fl_map = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                                  fll_map, P2M_FLL_ENTRIES);
+    if ( fl_map == NULL )
+    {
+        ERROR("Couldn't map p2m_frame_list");
+        goto out;
+    }
+
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+       the guest must not change which frames are used for this purpose. */
+    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                               fl_map, P2M_FL_ENTRIES);
+    if ( p2m == NULL )
+    {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+    live_p2m = p2m; /* So that translation macros will work */
+
+    /* Take a private snapshot of the frame list before rewriting it. */
+    fl_copy = malloc(P2M_FL_SIZE);
+    if ( fl_copy == NULL )
+    {
+        ERROR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    memcpy(fl_copy, fl_map, P2M_FL_SIZE);
+
+    /* Canonicalise the pfn-to-mfn table frame-number list. */
+    for ( pfn = 0; pfn < p2m_size; pfn += fpp )
+    {
+        if ( translate_mfn_to_pfn(&fl_copy[pfn/fpp]) )
+            continue;
+
+        ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
+        ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, pfn, pfn/fpp,
+              (uint64_t)fl_copy[pfn/fpp]);
+        goto out;
+    }
+
+    /*
+     * Write an extended-info structure to inform the restore code that
+     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+     * slow paths in the restore code.
+     */
+    if ( (pt_levels == 3) &&
+         (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
+    {
+        unsigned long signature = ~0UL;
+        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
+        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+        char chunk_sig[]  = "vcpu";
+        int written;
+
+        /* Record layout: signature, total size, "vcpu", size, context. */
+        written = write_exact(io_fd, &signature, sizeof(signature)) &&
+                  write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) &&
+                  write_exact(io_fd, &chunk_sig, 4) &&
+                  write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) &&
+                  write_exact(io_fd, ctxt,       sizeof(*ctxt));
+        if ( !written )
+        {
+            ERROR("write: extended info");
+            goto out;
+        }
+    }
+
+    if ( !write_exact(io_fd, fl_copy, P2M_FL_SIZE) )
+    {
+        ERROR("write: p2m_frame_list");
+        goto out;
+    }
+
+    result = p2m;
+
+ out:
+
+    /* The table mapping survives only if it is being returned. */
+    if ( (result == NULL) && p2m )
+        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    if ( fll_map )
+        munmap(fll_map, PAGE_SIZE);
+
+    if ( fl_map )
+        munmap(fl_map, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+    free(fl_copy);
+
+    return result;
+}
+
+
+
+/*
+ * Save the state of domain 'dom' to the stream 'io_fd'.
+ *
+ * With XCFLAGS_LIVE set in 'flags', log-dirty mode is enabled and memory
+ * is sent in repeated passes while the guest runs; the guest is suspended
+ * (via the 'suspend' callback) only for the final pass.  Without it the
+ * guest is suspended up front and memory is sent in a single pass.
+ * 'max_iters' and 'max_factor' bound the live passes; zero selects
+ * DEF_MAX_ITERS / DEF_MAX_FACTOR.  With XCFLAGS_DEBUG set, all pages are
+ * resent once more after the final pass.
+ *
+ * For HVM guests ('hvm' non-zero) 'init_qemu_maps' and 'qemu_flip_buffer'
+ * are used to obtain the dirty bitmaps maintained by the device model.
+ *
+ * Returns 0 on success and 1 on failure.  Apart from the two checks made
+ * before anything is acquired, every failure goes through the common
+ * 'out' path so that mappings and buffers are released and log-dirty mode
+ * is switched off again.
+ */
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+                   int hvm, void *(*init_qemu_maps)(int, unsigned),
+                   void (*qemu_flip_buffer)(int, int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i, j, last_iter, iter = 0;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+    int race = 0, sent_last_iter, skip_this_iter;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
+
+    /* A copy of one frame of guest memory. */
+    char page[PAGE_SIZE];
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* number of pages currently mapped at region_base (for error cleanup) */
+    unsigned int region_pages = 0;
+
+    /* power of 2 order of p2m_size */
+    int order_nr;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    uint64_t vcpumap = 1ULL;
+
+    /* HVM: a buffer for holding HVM context */
+    uint32_t hvm_buf_size = 0;
+    uint8_t *hvm_buf = NULL;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* If no explicit control parameters given, use defaults */
+    max_iters  = max_iters  ? : DEF_MAX_ITERS;
+    max_factor = max_factor ? : DEF_MAX_FACTOR;
+
+    initialize_mbit_rate();
+
+    if ( !get_platform_info(xc_handle, dom,
+                            &max_mfn, &hvirt_start, &pt_levels) )
+    {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
+    {
+        ERROR("Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* Map the shared info frame */
+    if ( !hvm )
+    {
+        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                           PROT_READ, shared_info_frame);
+        if ( !live_shinfo )
+        {
+            ERROR("Couldn't map live_shinfo");
+            goto out;
+        }
+    }
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
+
+    /* Domain is still running at this point */
+    if ( live )
+    {
+        /* Live suspend. Enable log-dirty mode. */
+        if ( xc_shadow_control(xc_handle, dom,
+                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                               NULL, 0, NULL, 0, NULL) < 0 )
+        {
+            ERROR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        if ( hvm )
+        {
+            /* Get qemu-dm logging dirty pages too */
+            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+            qemu_bitmaps[0] = seg;
+            qemu_bitmaps[1] = seg + BITMAP_SIZE;
+            qemu_active = 0;
+            qemu_non_active = 1;
+        }
+    }
+    else
+    {
+        /* This is a non-live suspend. Suspend the domain .*/
+        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
+        {
+            ERROR("Domain appears not to have suspended");
+            goto out;
+        }
+    }
+
+    last_iter = !live;
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* calculate the power of 2 order of p2m_size, e.g.
+       15->4 16->4 17->5 */
+    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
+        continue;
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE);
+    to_fix  = calloc(1, BITMAP_SIZE);
+    to_skip = malloc(BITMAP_SIZE);
+
+    if ( !to_send || !to_fix || !to_skip )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    /*
+     * Lock failures must take the common exit path: by now the bitmaps are
+     * allocated, the shared info page may be mapped and log-dirty mode may
+     * be enabled, all of which 'out' undoes.
+     */
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        goto out;
+    }
+
+    /* (to fix is local only) */
+    if ( lock_pages(to_skip, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_skip");
+        goto out;
+    }
+
+    if ( hvm )
+    {
+        /* Need another buffer for HVM context */
+        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
+        if ( hvm_buf_size == -1 )
+        {
+            ERROR("Couldn't get HVM context size from Xen");
+            goto out;
+        }
+        hvm_buf = malloc(hvm_buf_size);
+        if ( !hvm_buf )
+        {
+            ERROR("Couldn't allocate memory");
+            goto out;
+        }
+    }
+
+    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
+    {
+        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock");
+        goto out;
+    }
+
+    /* Setup the mfn_to_pfn table mapping */
+    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
+    {
+        ERROR("Failed to map live M2P table");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("write: p2m_size");
+        goto out;
+    }
+
+    if ( !hvm )
+    {
+        int err = 0;
+        unsigned long mfn;
+
+        /* Map the P2M table, and write the list of P2M frames */
+        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
+                                          &ctxt, p2m_size, live_shinfo);
+        if ( live_p2m == NULL )
+        {
+            ERROR("Failed to map/save the p2m frame list");
+            goto out;
+        }
+
+        /*
+         * Quick belt and braces sanity check.
+         */
+
+        for ( i = 0; i < p2m_size; i++ )
+        {
+            mfn = live_p2m[i];
+            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
+            {
+                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
+                        mfn, mfn_to_pfn(mfn));
+                err++;
+            }
+        }
+        DPRINTF("Had %d unexplained entries in p2m table\n", err);
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    for ( ; ; )
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+            int rc; /* local result of the peek; the function's rc is untouched */
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            if ( !last_iter )
+            {
+                /* Slightly wasteful to peek the whole array every time,
+                   but this is fast enough for the moment. */
+                rc = xc_shadow_control(
+                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
+                    p2m_size, NULL, 0, NULL);
+                if ( rc != p2m_size )
+                {
+                    ERROR("Error peeking shadow bitmap");
+                    goto out;
+                }
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = permute(N, p2m_size, order_nr);
+
+                if ( debug )
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
+                            iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
+                            test_bit(n, to_send),
+                            hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
+
+                if ( !last_iter &&
+                     test_bit(n, to_send) &&
+                     test_bit(n, to_skip) )
+                    skip_this_iter++; /* stats keeping */
+
+                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                       (test_bit(n, to_send) && last_iter) ||
+                       (test_bit(n, to_fix)  && last_iter)) )
+                    continue;
+
+                /* Skip PFNs that aren't really there */
+                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
+                    continue;
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. (ignore to_skip in last iteration)
+                **  3. add in pages that still need fixup (net bufs)
+                */
+
+                pfn_batch[batch] = n;
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                 * and MFNs for PV guests */
+                if ( hvm )
+                    pfn_type[batch] = n;
+                else
+                    pfn_type[batch] = live_p2m[n];
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in pseudo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless it's sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( last_iter &&
+                     test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+
+                clear_bit(n, to_fix);
+
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+            /* Remember the size so 'out' can unmap after a mid-batch error. */
+            region_pages = batch;
+
+            if ( !hvm )
+            {
+                /* Get page types */
+                for ( j = 0; j < batch; j++ )
+                    ((uint32_t *)pfn_type)[j] = pfn_type[j];
+                if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
+                                           (uint32_t *)pfn_type) )
+                {
+                    ERROR("get_pfn_type_batch failed");
+                    goto out;
+                }
+                /* Widen in place, back to front, so nothing is overwritten
+                   before it has been read. */
+                for ( j = batch-1; j >= 0; j-- )
+                    pfn_type[j] = ((uint32_t *)pfn_type)[j];
+
+                for ( j = 0; j < batch; j++ )
+                {
+
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
+                         XEN_DOMCTL_PFINFO_XTAB )
+                    {
+                        DPRINTF("type fail: page %i mfn %08lx\n",
+                                j, pfn_type[j]);
+                        continue;
+                    }
+
+                    if ( debug )
+                        DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
+                                " sum= %08lx\n",
+                                iter,
+                                (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+                                pfn_batch[j],
+                                pfn_type[j],
+                                mfn_to_pfn(pfn_type[j] &
+                                           ~XEN_DOMCTL_PFINFO_LTAB_MASK),
+                                csum_page(region_base + (PAGE_SIZE*j)));
+
+                    /* canonicalise mfn->pfn */
+                    pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+                        pfn_batch[j];
+                }
+            }
+
+            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                ERROR("Error when writing to state file (2) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                ERROR("Error when writing to state file (3) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+                void *spage = (char *)region_base + (PAGE_SIZE*j);
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                /* write out pages in batch */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    /* We have a pagetable page: need to rewrite it. */
+                    race =
+                        canonicalize_pagetable(pagetype, pfn, spage, page);
+
+                    if ( race && !live )
+                    {
+                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
+                              pagetype);
+                        goto out;
+                    }
+
+                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
+                    {
+                        ERROR("Error when writing to state file (4)"
+                              " (errno %d)", errno);
+                        goto out;
+                    }
+                }
+                else
+                {
+                    /* We have a normal page: just write it directly. */
+                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
+                         PAGE_SIZE )
+                    {
+                        ERROR("Error when writing to state file (5)"
+                              " (errno %d)", errno);
+                        goto out;
+                    }
+                }
+            } /* end of the write out for this batch */
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+            region_base = NULL;
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        if ( last_iter )
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+
+        if ( last_iter && debug )
+        {
+            int minusone = -1;
+            memset(to_send, 0xff, BITMAP_SIZE);
+            debug = 0;
+            DPRINTF("Entering debug resend-all mode\n");
+
+            /* send "-1" to put receiver into debug mode */
+            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
+            {
+                ERROR("Error when writing to state file (6) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            continue;
+        }
+
+        if ( last_iter )
+            break;
+
+        if ( live )
+        {
+            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+                 (iter >= max_iters) ||
+                 (sent_this_iter+skip_this_iter < 50) ||
+                 (total_sent > p2m_size*max_factor) )
+            {
+                DPRINTF("Start last iteration\n");
+                last_iter = 1;
+
+                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
+                                       &ctxt) )
+                {
+                    ERROR("Domain appears not to have suspended");
+                    goto out;
+                }
+
+                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
+                        info.shared_info_frame,
+                        (unsigned long)ctxt.user_regs.eip,
+                        (unsigned long)ctxt.user_regs.edx);
+            }
+
+            if ( xc_shadow_control(xc_handle, dom,
+                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
+                                   p2m_size, NULL, 0, &stats) != p2m_size )
+            {
+                ERROR("Error flushing shadow PT");
+                goto out;
+            }
+
+            if ( hvm )
+            {
+                /* Pull in the dirty bits from qemu-dm too */
+                if ( !last_iter )
+                {
+                    qemu_active = qemu_non_active;
+                    qemu_non_active = qemu_active ? 0 : 1;
+                    qemu_flip_buffer(dom, qemu_active);
+                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+                    {
+                        to_send[j] |= qemu_bitmaps[qemu_non_active][j];
+                        qemu_bitmaps[qemu_non_active][j] = 0;
+                    }
+                }
+                else
+                {
+                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+                        to_send[j] |= qemu_bitmaps[qemu_active][j];
+                }
+            }
+
+            sent_last_iter = sent_this_iter;
+
+            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+
+        }
+    } /* end of infinite for loop */
+
+    DPRINTF("All memory is saved\n");
+
+    {
+        struct {
+            int minustwo;
+            int max_vcpu_id;
+            uint64_t vcpumap;
+        } chunk = { -2, info.max_vcpu_id };
+
+        /* vcpumap is a 64-bit mask, one bit per VCPU. */
+        if ( info.max_vcpu_id >= 64 )
+        {
+            ERROR("Too many VCPUS in guest!");
+            goto out;
+        }
+
+        for ( i = 1; i <= info.max_vcpu_id; i++ )
+        {
+            xc_vcpuinfo_t vinfo;
+            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
+                 vinfo.online )
+                vcpumap |= 1ULL << i;
+        }
+
+        chunk.vcpumap = vcpumap;
+        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            ERROR("Error when writing to state file (errno %d)", errno);
+            goto out;
+        }
+    }
+
+    /* Zero terminate */
+    i = 0;
+    if ( !write_exact(io_fd, &i, sizeof(int)) )
+    {
+        ERROR("Error when writing to state file (6') (errno %d)", errno);
+        goto out;
+    }
+
+    if ( hvm )
+    {
+        uint32_t rec_size;
+
+        /* Save magic-page locations. */
+        memset(magic_pfns, 0, sizeof(magic_pfns));
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                         (unsigned long *)&magic_pfns[0]);
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                         (unsigned long *)&magic_pfns[1]);
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                         (unsigned long *)&magic_pfns[2]);
+        if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+        {
+            ERROR("Error when writing to state file (7)");
+            goto out;
+        }
+
+        /* Save vcpu contexts */
+
+        for ( i = 0; i <= info.max_vcpu_id; i++ )
+        {
+            if ( !(vcpumap & (1ULL << i)) )
+                continue;
+
+            if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+            {
+                ERROR("HVM:Could not get vcpu context");
+                goto out;
+            }
+
+            DPRINTF("write vcpu %d context.\n", i);
+            if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+            {
+                ERROR("write vcpu context failed!\n");
+                goto out;
+            }
+        }
+
+        /* Get HVM context from Xen and save it too */
+        if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
+                                                  hvm_buf_size)) == -1 )
+        {
+            ERROR("HVM:Could not get hvm buffer");
+            goto out;
+        }
+
+        if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+        {
+            ERROR("error write hvm buffer size");
+            goto out;
+        }
+
+        if ( !write_exact(io_fd, hvm_buf, rec_size) )
+        {
+            ERROR("write HVM info failed!\n");
+            goto out;
+        }
+
+        /* HVM guests are done now */
+        rc = 0;
+        goto out;
+    }
+
+    /* PV guests only from now on */
+
+    /* Send through a list of all the PFNs that were not in map at the close */
+    {
+        unsigned int i,j;
+        unsigned long pfntab[1024];
+
+        /* First pass: count them, so the total can be written up front. */
+        for ( i = 0, j = 0; i < p2m_size; i++ )
+        {
+            if ( !is_mapped(live_p2m[i]) )
+                j++;
+        }
+
+        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
+        {
+            ERROR("Error when writing to state file (6a) (errno %d)", errno);
+            goto out;
+        }
+
+        /* Second pass: emit the PFNs in chunks of up to 1024 entries. */
+        for ( i = 0, j = 0; i < p2m_size; )
+        {
+            if ( !is_mapped(live_p2m[i]) )
+                pfntab[j++] = i;
+
+            i++;
+            if ( (j == 1024) || (i == p2m_size) )
+            {
+                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
+                {
+                    ERROR("Error when writing to state file (6b) (errno %d)",
+                          errno);
+                    goto out;
+                }
+                j = 0;
+            }
+        }
+    }
+
+    /* Canonicalise the suspend-record frame number. */
+    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
+    {
+        ERROR("Suspend record is not in range of pseudophys map");
+        goto out;
+    }
+
+    for ( i = 0; i <= info.max_vcpu_id; i++ )
+    {
+        if ( !(vcpumap & (1ULL << i)) )
+            continue;
+
+        /* VCPU 0's context was already fetched when the domain suspended. */
+        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+        {
+            ERROR("No context for VCPU%d", i);
+            goto out;
+        }
+
+        /* Canonicalise each GDT frame number. */
+        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
+        {
+            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
+            {
+                ERROR("GDT frame is not in range of pseudophys map");
+                goto out;
+            }
+        }
+
+        /* Canonicalise the page table base pointer. */
+        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
+        {
+            ERROR("PT base is not in range of pseudophys map");
+            goto out;
+        }
+        ctxt.ctrlreg[3] =
+            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
+
+        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
+        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
+        {
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
+            {
+                ERROR("PT base is not in range of pseudophys map");
+                goto out;
+            }
+            /* Least-significant bit means 'valid PFN'. */
+            ctxt.ctrlreg[1] = 1 |
+                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
+        }
+
+        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
+        {
+            ERROR("Error when writing to state file (1) (errno %d)", errno);
+            goto out;
+        }
+    }
+
+    /*
+     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
+     */
+    memcpy(page, live_shinfo, PAGE_SIZE);
+    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
+    if ( !write_exact(io_fd, page, PAGE_SIZE) )
+    {
+        ERROR("Error when writing to state file (1) (errno %d)", errno);
+        goto out;
+    }
+
+    /* Success! */
+    rc = 0;
+
+ out:
+
+    if ( live )
+    {
+        if ( xc_shadow_control(xc_handle, dom,
+                               XEN_DOMCTL_SHADOW_OP_OFF,
+                               NULL, 0, NULL, 0, NULL) < 0 )
+            DPRINTF("Warning - couldn't disable shadow mode");
+    }
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    /* Non-NULL here only if an error interrupted a batch. */
+    if ( region_base )
+        munmap(region_base, region_pages*PAGE_SIZE);
+
+    if ( live_shinfo )
+        munmap(live_shinfo, PAGE_SIZE);
+
+    /*
+     * live_p2m and live_m2p are file-scope: clear them once unmapped so a
+     * later call that fails early does not unmap a stale address.
+     */
+    if ( live_p2m )
+    {
+        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+        live_p2m = NULL;
+    }
+
+    if ( live_m2p )
+    {
+        munmap(live_m2p, M2P_SIZE(max_mfn));
+        live_m2p = NULL;
+    }
+
+    free(pfn_type);
+    free(pfn_batch);
+    free(to_send);
+    free(to_fix);
+    free(to_skip);
+    free(hvm_buf);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Wed Apr 11 07:30:02 2007 -0600
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,755 +0,0 @@
-/******************************************************************************
- * xc_hvm_save.c
- *
- * Save the state of a running HVM guest.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006 Intel Corperation
- * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/e820.h>
-#include <xen/hvm/params.h>
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_hvm_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
-
-/* Shared-memory bitmaps for getting log-dirty bits from qemu */
-static unsigned long *qemu_bitmaps[2];
-static int qemu_active;
-static int qemu_non_active;
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
-#define BITMAP_SIZE (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
-
-#define BITMAP_ENTRY(_nr,_bmap) \
- ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
- return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline int permute( int i, int nr, int order_nr )
-{
- /* Need a simple permutation function so that we scan pages in a
- pseudo random order, enabling us to get a better estimate of
- the domain's page dirtying rate as we go (there are often
- contiguous ranges of pfns that have similar behaviour, and we
- want to mix them up. */
-
- /* e.g. nr->oder 15->4 16->4 17->5 */
- /* 512MB domain, 128k pages, order 17 */
-
- /*
- QPONMLKJIHGFEDCBA
- QPONMLKJIH
- GFEDCBA
- */
-
- /*
- QPONMLKJIHGFEDCBA
- EDCBA
- QPONM
- LKJIHGF
- */
-
- do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
- while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
- return i;
-}
-
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
-#define initialize_mbit_rate()
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
- return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
- xc_shadow_op_stats_t *stats, int print)
-{
- static struct timeval wall_last;
- static long long d0_cpu_last;
- static long long d1_cpu_last;
-
- struct timeval wall_now;
- long long wall_delta;
- long long d0_cpu_now, d0_cpu_delta;
- long long d1_cpu_now, d1_cpu_delta;
-
- gettimeofday(&wall_now, NULL);
-
- d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
- d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
- if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
- DPRINTF("ARRHHH!!\n");
-
- wall_delta = tv_delta(&wall_now,&wall_last)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
- d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
- if ( print )
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
-
- d0_cpu_last = d0_cpu_now;
- d1_cpu_last = d1_cpu_now;
- wall_last = wall_now;
-
- return 0;
-}
-
-static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size,
- unsigned long *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, pfn_array_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info,
- vcpu_guest_context_t *ctxt)
-{
- int i = 0;
-
- if ( !(*suspend)(dom) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- retry:
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
- {
- ERROR("Could not get domain info");
- return -1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
- ERROR("Could not get vcpu context");
-
- if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) )
- return 0; /* success */
-
- if ( info->paused )
- {
- /* Try unpausing domain, wait, and retest. */
- xc_domain_unpause( xc_handle, dom );
- ERROR("Domain was paused. Wait and re-test.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- if ( ++i < 100 )
- {
- ERROR("Retry suspend domain.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- ERROR("Unable to suspend domain.");
-
- return -1;
-}
-
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int))
-{
- xc_dominfo_t info;
-
- int rc = 1, i, j, last_iter, iter = 0;
- int live = !!(flags & XCFLAGS_LIVE);
- int debug = !!(flags & XCFLAGS_DEBUG);
- int sent_last_iter, skip_this_iter;
-
- /* The highest guest-physical frame number used by the current guest */
- unsigned long max_pfn;
-
- /* The size of an array big enough to contain all guest pfns */
- unsigned long pfn_array_size;
-
- /* Magic frames: ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
-
- /* A table containg the PFNs (/not/ MFN!) to map. */
- xen_pfn_t *pfn_batch = NULL;
-
- /* A copy of hvm domain context buffer*/
- uint32_t hvm_buf_size;
- uint8_t *hvm_buf = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- uint32_t rec_size, nr_vcpus;
-
- /* power of 2 order of pfn_array_size */
- int order_nr;
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty; */
- unsigned long *to_send = NULL, *to_skip = NULL;
-
- xc_shadow_op_stats_t stats;
-
- unsigned long total_sent = 0;
-
- uint64_t vcpumap = 1ULL;
-
- DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
- "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
- live, debug);
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- initialize_mbit_rate();
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
- {
- ERROR("HVM: Could not get domain info");
- return 1;
- }
- nr_vcpus = info.nr_online_vcpus;
-
- if ( mlock(&ctxt, sizeof(ctxt)) )
- {
- ERROR("HVM: Unable to mlock ctxt");
- return 1;
- }
-
- /* Only have to worry about vcpu 0 even for SMP */
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
- {
- ERROR("HVM: Could not get vcpu context");
- goto out;
- }
-
- DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n",
- info.max_memkb, info.nr_pages);
-
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- ERROR("Couldn't enable shadow mode");
- goto out;
- }
- }
- else
- {
- /* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
- {
- ERROR("HVM Domain appears not to have suspended");
- goto out;
- }
- }
-
- last_iter = !live;
-
- max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
-
- DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, "
- "max_memkb=0x%lx, live=%d.\n",
- max_pfn, info.max_memkb, live);
-
- /* Size of any array that covers 0 ... max_pfn */
- pfn_array_size = max_pfn + 1;
- if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
- {
- ERROR("Error when writing to state file (1)");
- goto out;
- }
-
- /* pretend we sent all the pages last iteration */
- sent_last_iter = pfn_array_size;
-
- /* calculate the power of 2 order of pfn_array_size, e.g.
- 15->4 16->4 17->5 */
- for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
- continue;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = malloc(BITMAP_SIZE);
- to_skip = malloc(BITMAP_SIZE);
-
- if ( live )
- {
- /* Get qemu-dm logging dirty pages too */
- void *seg = init_qemu_maps(dom, BITMAP_SIZE);
- qemu_bitmaps[0] = seg;
- qemu_bitmaps[1] = seg + BITMAP_SIZE;
- qemu_active = 0;
- qemu_non_active = 1;
- }
-
- hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
- if ( hvm_buf_size == -1 )
- {
- ERROR("Couldn't get HVM context size from Xen");
- goto out;
- }
- hvm_buf = malloc(hvm_buf_size);
-
- if ( !to_send || !to_skip || !hvm_buf )
- {
- ERROR("Couldn't allocate memory");
- goto out;
- }
-
- memset(to_send, 0xff, BITMAP_SIZE);
-
- if ( lock_pages(to_send, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_send");
- return 1;
- }
-
- /* (to fix is local only) */
- if ( lock_pages(to_skip, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_skip");
- return 1;
- }
-
- analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0);
-
- /* We want zeroed memory so use calloc rather than malloc. */
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- if ( pfn_batch == NULL )
- {
- ERROR("failed to alloc memory for pfn_batch array");
- errno = ENOMEM;
- goto out;
- }
-
- for ( ; ; )
- {
- unsigned int prev_pc, sent_this_iter, N, batch;
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- prev_pc = 0;
- N=0;
-
- DPRINTF("Saving memory pages: iter %d 0%%", iter);
-
- while ( N < pfn_array_size )
- {
- unsigned int this_pc = (N * 100) / pfn_array_size;
- int rc;
-
- if ( (this_pc - prev_pc) >= 5 )
- {
- DPRINTF("\b\b\b\b%3d%%", this_pc);
- prev_pc = this_pc;
- }
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array evey time,
- but this is fast enough for the moment. */
- rc = xc_shadow_control(
- xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
- pfn_array_size, NULL, 0, NULL);
- if ( rc != pfn_array_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_batch[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < pfn_array_size);
- N++ )
- {
- int n = permute(N, pfn_array_size, order_nr);
-
- if ( 0 && debug )
- DPRINTF("%d pfn= %08lx %d \n",
- iter, (unsigned long)n, test_bit(n, to_send));
-
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter)) )
- continue;
-
- /* Skip PFNs that aren't really there */
- if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */
- || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) &&
- n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ )
- continue;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- */
-
- pfn_batch[batch] = n;
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_READ, pfn_batch, batch);
- if ( region_base == 0 )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- /* write num of pfns */
- if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (2)");
- goto out;
- }
-
- /* write all the pfns */
- if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) )
- {
- ERROR("Error when writing to state file (3)");
- goto out;
- }
-
- for ( j = 0; j < batch; j++ )
- {
- if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
- continue;
- if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
- PAGE_SIZE) != PAGE_SIZE )
- {
- ERROR("ERROR when writing to state file (4)");
- goto out;
- }
- }
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- total_sent += sent_this_iter;
-
- DPRINTF("\r %d: sent %d, skipped %d, ",
- iter, sent_this_iter, skip_this_iter );
-
- if ( last_iter )
- {
- print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/pfn_array_size );
- }
-
- if ( last_iter && debug )
- {
- int minusone = -1;
- memset(to_send, 0xff, BITMAP_SIZE);
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( !write_exact(io_fd, &minusone, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6)");
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
- (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > pfn_array_size*max_factor) )
- {
- DPRINTF("Start last iteration for HVM domain\n");
- last_iter = 1;
-
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
- &ctxt))
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND eip %08lx edx %08lx\n",
- (unsigned long)ctxt.user_regs.eip,
- (unsigned long)ctxt.user_regs.edx);
- }
-
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
- pfn_array_size, NULL,
- 0, &stats) != pfn_array_size )
- {
- ERROR("Error flushing shadow PT");
- goto out;
- }
-
- /* Pull in the dirty bits from qemu too */
- if ( !last_iter )
- {
- qemu_active = qemu_non_active;
- qemu_non_active = qemu_active ? 0 : 1;
- qemu_flip_buffer(dom, qemu_active);
- for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
- {
- to_send[j] |= qemu_bitmaps[qemu_non_active][j];
- qemu_bitmaps[qemu_non_active][j] = 0;
- }
- }
- else
- {
- for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
- to_send[j] |= qemu_bitmaps[qemu_active][j];
- }
-
- sent_last_iter = sent_this_iter;
-
- print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
- }
- } /* end of while 1 */
-
-
- DPRINTF("All HVM memory is saved\n");
-
- {
- struct {
- int minustwo;
- int max_vcpu_id;
- uint64_t vcpumap;
- } chunk = { -2, info.max_vcpu_id };
-
- if (info.max_vcpu_id >= 64) {
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for (i = 1; i <= info.max_vcpu_id; i++) {
- xc_vcpuinfo_t vinfo;
- if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
- vinfo.online)
- vcpumap |= 1ULL << i;
- }
-
- chunk.vcpumap = vcpumap;
- if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
- ERROR("Error when writing to state file (errno %d)", errno);
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( !write_exact(io_fd, &i, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6)");
- goto out;
- }
-
- /* Save magic-page locations. */
- memset(magic_pfns, 0, sizeof(magic_pfns));
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
- (unsigned long *)&magic_pfns[0]);
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
- (unsigned long *)&magic_pfns[1]);
- xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
- (unsigned long *)&magic_pfns[2]);
- if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- ERROR("Error when writing to state file (7)");
- goto out;
- }
-
- /* save vcpu/vmcs contexts */
- for ( i = 0; i < nr_vcpus; i++ )
- {
- if ( !(vcpumap & (1ULL << i)) )
- continue;
-
- if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
- {
- ERROR("HVM:Could not get vcpu context");
- goto out;
- }
-
- DPRINTF("write vcpu %d context.\n", i);
- if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
- {
- ERROR("write vcpu context failed!\n");
- goto out;
- }
- }
-
- if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
- hvm_buf_size)) == -1 )
- {
- ERROR("HVM:Could not get hvm buffer");
- goto out;
- }
-
- if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
- {
- ERROR("error write hvm buffer size");
- goto out;
- }
-
- if ( !write_exact(io_fd, hvm_buf, rec_size) )
- {
- ERROR("write HVM info failed!\n");
- goto out;
- }
-
- /* Success! */
- rc = 0;
-
- out:
-
- if ( live )
- {
- if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- }
-
- free(hvm_buf);
- free(pfn_batch);
- free(to_send);
- free(to_skip);
-
- return !!rc;
-}
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c Wed Apr 11 07:30:02 2007 -0600
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1414 +0,0 @@
-/******************************************************************************
- * xc_linux_save.c
- *
- * Save the state of a running Linux session.
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xc_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_linux_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
-
-/* max mfn of the whole machine */
-static unsigned long max_mfn;
-
-/* virtual starting address of the hypervisor */
-static unsigned long hvirt_start;
-
-/* #levels of page tables used by the current guest */
-static unsigned int pt_levels;
-
-/* number of pfns this guest has (i.e. number of entries in the P2M) */
-static unsigned long p2m_size;
-
-/* Live mapping of the table mapping each PFN to its current MFN. */
-static xen_pfn_t *live_p2m = NULL;
-
-/* Live mapping of system MFN to PFN table. */
-static xen_pfn_t *live_m2p = NULL;
-static unsigned long m2p_mfn0;
-
-/* grep fodder: machine_to_phys */
-
-#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
-
-/*
- * Returns TRUE if the given machine frame number has a unique mapping
- * in the guest's pseudophysical map.
- */
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
- (((_mfn) < (max_mfn)) && \
- ((mfn_to_pfn(_mfn) < (p2m_size)) && \
- (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
-
-/* Returns TRUE if MFN is successfully converted to a PFN. */
-#define translate_mfn_to_pfn(_pmfn) \
-({ \
- unsigned long mfn = *(_pmfn); \
- int _res = 1; \
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
- _res = 0; \
- else \
- *(_pmfn) = mfn_to_pfn(mfn); \
- _res; \
-})
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITMAP_SIZE ((p2m_size + BITS_PER_LONG - 1) / 8)
-
-#define BITMAP_ENTRY(_nr,_bmap) \
- ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
- return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline void set_bit ( int nr, volatile void * addr)
-{
- BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
-}
-
-/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
-static inline unsigned int hweight32(unsigned int w)
-{
- unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
- res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
- res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
- res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
- return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
-}
-
-static inline int count_bits ( int nr, volatile void *addr)
-{
- int i, count = 0;
- volatile unsigned long *p = (volatile unsigned long *)addr;
- /* We know that the array is padded to unsigned long. */
- for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
- count += hweight32(*p);
- return count;
-}
-
-static inline int permute( int i, int nr, int order_nr )
-{
- /* Need a simple permutation function so that we scan pages in a
- pseudo random order, enabling us to get a better estimate of
- the domain's page dirtying rate as we go (there are often
- contiguous ranges of pfns that have similar behaviour, and we
- want to mix them up. */
-
- /* e.g. nr->oder 15->4 16->4 17->5 */
- /* 512MB domain, 128k pages, order 17 */
-
- /*
- QPONMLKJIHGFEDCBA
- QPONMLKJIH
- GFEDCBA
- */
-
- /*
- QPONMLKJIHGFEDCBA
- EDCBA
- QPONM
- LKJIHGF
- */
-
- do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
- while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
- return i;
-}
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-static int noncached_write(int fd, int live, void *buffer, int len)
-{
- static int write_count = 0;
-
- int rc = write(fd,buffer,len);
-
- write_count += len;
- if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
- {
- /* Time to discard cache - dont care if this fails */
- discard_file_cache(fd, 0 /* no flush */);
- write_count = 0;
- }
-
- return rc;
-}
-
-#ifdef ADAPTIVE_SAVE
-
-/*
-** We control the rate at which we transmit (or save) to minimize impact
-** on running domains (including the target if we're doing live migrate).
-*/
-
-#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
-#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
-
-/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
-#define RATE_TO_BTU 781250
-
-/* Amount in bytes we allow ourselves to send in a burst */
-#define BURST_BUDGET (100*1024)
-
-/* We keep track of the current and previous transmission rate */
-static int mbit_rate, ombit_rate = 0;
-
-/* Have we reached the maximum transmission rate? */
-#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
-
-static inline void initialize_mbit_rate()
-{
- mbit_rate = START_MBIT_RATE;
-}
-
-static int ratewrite(int io_fd, int live, void *buf, int n)
-{
- static int budget = 0;
- static int burst_time_us = -1;
- static struct timeval last_put = { 0 };
- struct timeval now;
- struct timespec delay;
- long long delta;
-
- if ( START_MBIT_RATE == 0 )
- return noncached_write(io_fd, live, buf, n);
-
- budget -= n;
- if ( budget < 0 )
- {
- if ( mbit_rate != ombit_rate )
- {
- burst_time_us = RATE_TO_BTU / mbit_rate;
- ombit_rate = mbit_rate;
- DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
- mbit_rate, BURST_BUDGET, burst_time_us);
- }
- if ( last_put.tv_sec == 0 )
- {
- budget += BURST_BUDGET;
- gettimeofday(&last_put, NULL);
- }
- else
- {
- while ( budget < 0 )
- {
- gettimeofday(&now, NULL);
- delta = tv_delta(&now, &last_put);
- while ( delta > burst_time_us )
- {
- budget += BURST_BUDGET;
- last_put.tv_usec += burst_time_us;
- if ( last_put.tv_usec > 1000000
- {
- last_put.tv_usec -= 1000000;
- last_put.tv_sec++;
- }
- delta -= burst_time_us;
- }
- if ( budget > 0 )
- break;
- delay.tv_sec = 0;
- delay.tv_nsec = 1000 * (burst_time_us - delta);
- while ( delay.tv_nsec > 0 )
- if ( nanosleep(&delay, &delay) == 0 )
- break;
- }
- }
- }
- return noncached_write(io_fd, live, buf, n);
-}
-
-#else /* ! ADAPTIVE SAVE */
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
-#define initialize_mbit_rate()
-
-#endif
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
- return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
- xc_shadow_op_stats_t *stats, int print)
-{
- static struct timeval wall_last;
- static long long d0_cpu_last;
- static long long d1_cpu_last;
-
- struct timeval wall_now;
- long long wall_delta;
- long long d0_cpu_now, d0_cpu_delta;
- long long d1_cpu_now, d1_cpu_delta;
-
- gettimeofday(&wall_now, NULL);
-
- d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
- d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
- if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
- DPRINTF("ARRHHH!!\n");
-
- wall_delta = tv_delta(&wall_now,&wall_last)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
- d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
- if ( print )
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
-
-#ifdef ADAPTIVE_SAVE
- if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
- {
- mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
- + 50;
- if ( mbit_rate > MAX_MBIT_RATE )
- mbit_rate = MAX_MBIT_RATE;
- }
-#endif
-
- d0_cpu_last = d0_cpu_now;
- d1_cpu_last = d1_cpu_now;
- wall_last = wall_now;
-
- return 0;
-}
-
-
-static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
- unsigned long *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, p2m_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info,
- vcpu_guest_context_t *ctxt)
-{
- int i = 0;
-
- if ( !(*suspend)(dom) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- retry:
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
- {
- ERROR("Could not get domain info");
- return -1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
- ERROR("Could not get vcpu context");
-
-
- if ( info->dying )
- {
- ERROR("domain is dying");
- return -1;
- }
-
- if ( info->crashed )
- {
- ERROR("domain has crashed");
- return -1;
- }
-
- if ( info->shutdown )
- {
- switch ( info->shutdown_reason )
- {
- case SHUTDOWN_poweroff:
- case SHUTDOWN_reboot:
- ERROR("domain has shut down");
- return -1;
- case SHUTDOWN_suspend:
- return 0;
- case SHUTDOWN_crash:
- ERROR("domain has crashed");
- return -1;
- }
- }
-
- if ( info->paused )
- {
- /* Try unpausing domain, wait, and retest. */
- xc_domain_unpause( xc_handle, dom );
- ERROR("Domain was paused. Wait and re-test.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- if ( ++i < 100 )
- {
- ERROR("Retry suspend domain");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- ERROR("Unable to suspend domain.");
-
- return -1;
-}
-
-/*
-** Map the top-level page of MFNs from the guest. The guest might not have
-** finished resuming from a previous restore operation, so we wait a while for
-** it to update the MFN to a reasonable value.
-*/
-static void *map_frame_list_list(int xc_handle, uint32_t dom,
- shared_info_t *shinfo)
-{
- int count = 100;
- void *p;
-
- while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
- usleep(10000);
-
- if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
- {
- ERROR("Timed out waiting for frame list updated.");
- return NULL;
- }
-
- p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
- shinfo->arch.pfn_to_mfn_frame_list_list);
- if ( p == NULL )
- ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
-
- return p;
-}
-
-/*
-** During transfer (or in the state file), all page-table pages must be
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-**
-** This function performs the appropriate conversion, taking into account
-** which entries do not require canonicalization (in particular, those
-** entries which map the virtual address reserved for the hypervisor).
-*/
-static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
- const void *spage, void *dpage)
-{
-
- int i, pte_last, xen_start, xen_end, race = 0;
- uint64_t pte;
-
- /*
- ** We need to determine which entries in this page table hold
- ** reserved hypervisor mappings. This depends on the current
- ** page table type as well as the number of paging levels.
- */
- xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
-
- if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
-
- if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
- xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
- /*
- ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
- ** We can spot this by looking for the guest linear mapping which
- ** Xen always ensures is present in that L2. Guests must ensure
- ** that this check will fail for other L2s.
- */
- if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- {
- int hstart;
- uint64_t he;
-
- hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
-
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
- {
- /* hvirt starts with xen stuff... */
- xen_start = hstart;
- }
- else if ( hvirt_start != 0xf5800000 )
- {
- /* old L2s from before hole was shrunk... */
- hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
- xen_start = hstart;
- }
- }
-
- if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** XXX SMH: should compute these from hvirt_start (which we have)
- ** and hvirt_end (which we don't)
- */
- xen_start = 256;
- xen_end = 272;
- }
-
- /* Now iterate through the page table, canonicalizing each PTE */
- for (i = 0; i < pte_last; i++ )
- {
- unsigned long pfn, mfn;
-
- if ( pt_levels == 2 )
- pte = ((const uint32_t*)spage)[i];
- else
- pte = ((const uint64_t*)spage)[i];
-
- if ( (i >= xen_start) && (i < xen_end) )
- pte = 0;
-
- if ( pte & _PAGE_PRESENT )
- {
- mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- /* This will happen if the type info is stale which
- is quite feasible under live migration */
- pfn = 0; /* zap it - we'll retransmit this page later */
- race = 1; /* inform the caller of race; fatal if !live */
- }
- else
- pfn = mfn_to_pfn(mfn);
-
- pte &= ~MADDR_MASK_X86;
- pte |= (uint64_t)pfn << PAGE_SHIFT;
-
- /*
- * PAE guest L3Es can contain these flags when running on
- * a 64bit hypervisor. We zap these here to avoid any
- * surprise at restore time...
- */
- if ( (pt_levels == 3) &&
- (type == XEN_DOMCTL_PFINFO_L3TAB) &&
- (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
- pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
- }
-
- if ( pt_levels == 2 )
- ((uint32_t*)dpage)[i] = pte;
- else
- ((uint64_t*)dpage)[i] = pte;
- }
-
- return race;
-}
-
-static xen_pfn_t *xc_map_m2p(int xc_handle,
- unsigned long max_mfn,
- int prot)
-{
- struct xen_machphys_mfn_list xmml;
- privcmd_mmap_entry_t *entries;
- unsigned long m2p_chunks, m2p_size;
- xen_pfn_t *m2p;
- xen_pfn_t *extent_start;
- int i, rc;
-
- m2p_size = M2P_SIZE(max_mfn);
- m2p_chunks = M2P_CHUNKS(max_mfn);
-
- xmml.max_extents = m2p_chunks;
- if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
- {
- ERROR("failed to allocate space for m2p mfns");
- return NULL;
- }
- set_xen_guest_handle(xmml.extent_start, extent_start);
-
- if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
- (xmml.nr_extents != m2p_chunks) )
- {
- ERROR("xc_get_m2p_mfns");
- return NULL;
- }
-
- if ( (m2p = mmap(NULL, m2p_size, prot,
- MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
- {
- ERROR("failed to mmap m2p");
- return NULL;
- }
-
- if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
- {
- ERROR("failed to allocate space for mmap entries");
- return NULL;
- }
-
- for ( i = 0; i < m2p_chunks; i++ )
- {
- entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
- entries[i].mfn = extent_start[i];
- entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
- }
-
- if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
- entries, m2p_chunks)) < 0 )
- {
- ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
- return NULL;
- }
-
- m2p_mfn0 = entries[0].mfn;
-
- free(extent_start);
- free(entries);
-
- return m2p;
-}
-
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int))
-{
- xc_dominfo_t info;
-
- int rc = 1, i, j, last_iter, iter = 0;
- int live = (flags & XCFLAGS_LIVE);
- int debug = (flags & XCFLAGS_DEBUG);
- int race = 0, sent_last_iter, skip_this_iter;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_t ctxt;
-
- /* A table containg the type of each PFN (/not/ MFN!). */
- unsigned long *pfn_type = NULL;
- unsigned long *pfn_batch = NULL;
-
- /* A temporary mapping, and a copy, of one frame of guest memory. */
- char page[PAGE_SIZE];
-
- /* Double and single indirect references to the live P2M table */
- xen_pfn_t *live_p2m_frame_list_list = NULL;
- xen_pfn_t *live_p2m_frame_list = NULL;
-
- /* A copy of the pfn-to-mfn table frame list. */
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* Live mapping of shared info structure */
- shared_info_t *live_shinfo = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- /* power of 2 order of p2m_size */
- int order_nr;
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty;
- - to fixup by sending at the end if not already resent; */
- unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
-
- xc_shadow_op_stats_t stats;
-
- unsigned long needed_to_fix = 0;
- unsigned long total_sent = 0;
-
- uint64_t vcpumap = 1ULL;
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- initialize_mbit_rate();
-
- if ( !get_platform_info(xc_handle, dom,
- &max_mfn, &hvirt_start, &pt_levels) )
- {
- ERROR("Unable to get platform info.");
- return 1;
- }
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
- {
- ERROR("Could not get domain info");
- return 1;
- }
-
- if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
- {
- ERROR("Could not get vcpu context");
- goto out;
- }
- shared_info_frame = info.shared_info_frame;
-
- /* Map the shared info frame */
- if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
- PROT_READ, shared_info_frame)) )
- {
- ERROR("Couldn't map live_shinfo");
- goto out;
- }
-
- p2m_size = live_shinfo->arch.max_pfn;
-
- live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
- live_shinfo);
- if ( !live_p2m_frame_list_list )
- goto out;
-
- live_p2m_frame_list =
- xc_map_foreign_batch(xc_handle, dom, PROT_READ,
- live_p2m_frame_list_list,
- P2M_FLL_ENTRIES);
- if ( !live_p2m_frame_list )
- {
- ERROR("Couldn't map p2m_frame_list");
- goto out;
- }
-
- /* Map all the frames of the pfn->mfn table. For migrate to succeed,
- the guest must not change which frames are used for this purpose.
- (its not clear why it would want to change them, and we'll be OK
- from a safety POV anyhow. */
-
- live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
- live_p2m_frame_list,
- P2M_FL_ENTRIES);
- if ( !live_p2m )
- {
- ERROR("Couldn't map p2m table");
- goto out;
- }
-
- /* Setup the mfn_to_pfn table mapping */
- if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
- {
- ERROR("Failed to map live M2P table");
- goto out;
- }
-
-
- /* Get a local copy of the live_P2M_frame_list */
- if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list array");
- goto out;
- }
- memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
-
- /* Canonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < p2m_size; i += fpp )
- {
- if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
- {
- ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
- ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
- (uint64_t)p2m_frame_list[i/fpp]);
- goto out;
- }
- }
-
- /* Domain is still running at this point */
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- ERROR("Couldn't enable shadow mode");
- goto out;
- }
- }
- else
- {
- /* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- }
-
- last_iter = !live;
-
- /* pretend we sent all the pages last iteration */
- sent_last_iter = p2m_size;
-
- /* calculate the power of 2 order of p2m_size, e.g.
- 15->4 16->4 17->5 */
- for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
- continue;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = malloc(BITMAP_SIZE);
- to_fix = calloc(1, BITMAP_SIZE);
- to_skip = malloc(BITMAP_SIZE);
-
- if ( !to_send || !to_fix || !to_skip )
- {
- ERROR("Couldn't allocate to_send array");
- goto out;
- }
-
- memset(to_send, 0xff, BITMAP_SIZE);
-
- if ( lock_pages(to_send, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_send");
- return 1;
- }
-
- /* (to fix is local only) */
- if ( lock_pages(to_skip, BITMAP_SIZE) )
- {
- ERROR("Unable to lock to_skip");
- return 1;
- }
-
- analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
-
- /* We want zeroed memory so use calloc rather than malloc. */
- pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- if ( (pfn_type == NULL) || (pfn_batch == NULL) )
- {
- ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
- errno = ENOMEM;
- goto out;
- }
-
- if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
- {
- ERROR("Unable to lock");
- goto out;
- }
-
- /*
- * Quick belt and braces sanity check.
- */
- {
- int err=0;
- unsigned long mfn;
- for ( i = 0; i < p2m_size; i++ )
- {
- mfn = live_p2m[i];
- if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
- {
- DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
- mfn, mfn_to_pfn(mfn));
- err++;
- }
- }
- DPRINTF("Had %d unexplained entries in p2m table\n", err);
- }
-
- /* Start writing out the saved-domain record. */
- if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
- {
- ERROR("write: p2m_size");
- goto out;
- }
-
- /*
- * Write an extended-info structure to inform the restore code that
- * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
- * slow paths in the restore code.
- */
- if ( (pt_levels == 3) &&
- (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
- {
- unsigned long signature = ~0UL;
- uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8;
- uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
- char chunk_sig[] = "vcpu";
- if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
- !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
- !write_exact(io_fd, &chunk_sig, 4) ||
- !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) ||
- !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
- {
- ERROR("write: extended info");
- goto out;
- }
- }
-
- if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
- {
- ERROR("write: p2m_frame_list");
- goto out;
- }
-
- print_stats(xc_handle, dom, 0, &stats, 0);
-
- /* Now write out each data page, canonicalising page tables as we go... */
- for ( ; ; )
- {
- unsigned int prev_pc, sent_this_iter, N, batch;
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- prev_pc = 0;
- N = 0;
-
- DPRINTF("Saving memory pages: iter %d 0%%", iter);
-
- while ( N < p2m_size )
- {
- unsigned int this_pc = (N * 100) / p2m_size;
- int rc;
-
- if ( (this_pc - prev_pc) >= 5 )
- {
- DPRINTF("\b\b\b\b%3d%%", this_pc);
- prev_pc = this_pc;
- }
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array evey time,
- but this is fast enough for the moment. */
- rc = xc_shadow_control(
- xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
- p2m_size, NULL, 0, NULL);
- if ( rc != p2m_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_type[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < p2m_size);
- N++ )
- {
- int n = permute(N, p2m_size, order_nr);
-
- if ( debug )
- DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
- iter, (unsigned long)n, live_p2m[n],
- test_bit(n, to_send),
- mfn_to_pfn(live_p2m[n]&0xFFFFF));
-
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- ** 3. add in pages that still need fixup (net bufs)
- */
-
- pfn_batch[batch] = n;
- pfn_type[batch] = live_p2m[n];
-
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
- ** not currently in psuedo-physical map -- set bit
- ** in to_fix since we must send this page in last_iter
- ** unless its sent sooner anyhow, or it never enters
- ** pseudo-physical map (e.g. for ballooned down domains)
- */
- set_bit(n, to_fix);
- continue;
- }
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_READ, pfn_type, batch);
- if ( region_base == NULL )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- for ( j = 0; j < batch; j++ )
- ((uint32_t *)pfn_type)[j] = pfn_type[j];
- if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
- (uint32_t *)pfn_type) )
- {
- ERROR("get_pfn_type_batch failed");
- goto out;
- }
- for ( j = batch-1; j >= 0; j-- )
- pfn_type[j] = ((uint32_t *)pfn_type)[j];
-
- for ( j = 0; j < batch; j++ )
- {
-
- if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
- XEN_DOMCTL_PFINFO_XTAB )
- {
- DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
- continue;
- }
-
- if ( debug )
- DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
- " sum= %08lx\n",
- iter,
- (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
- pfn_batch[j],
- pfn_type[j],
- mfn_to_pfn(pfn_type[j] &
- ~XEN_DOMCTL_PFINFO_LTAB_MASK),
- csum_page(region_base + (PAGE_SIZE*j)));
-
- /* canonicalise mfn->pfn */
- pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
- pfn_batch[j];
- }
-
- if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (2) (errno %d)",
- errno);
- goto out;
- }
-
- if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) )
- {
- ERROR("Error when writing to state file (3) (errno %d)",
- errno);
- goto out;
- }
-
- /* entering this loop, pfn_type is now in pfns (Not mfns) */
- for ( j = 0; j < batch; j++ )
- {
- unsigned long pfn, pagetype;
- void *spage = (char *)region_base + (PAGE_SIZE*j);
-
- pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- /* write out pages in batch */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
- continue;
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /* We have a pagetable page: need to rewrite it. */
- race =
- canonicalize_pagetable(pagetype, pfn, spage, page);
-
- if ( race && !live )
- {
- ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
- pagetype);
- goto out;
- }
-
- if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
- {
- ERROR("Error when writing to state file (4)"
- " (errno %d)", errno);
- goto out;
- }
- }
- else
- {
- /* We have a normal page: just write it directly. */
- if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
- PAGE_SIZE )
- {
- ERROR("Error when writing to state file (5)"
- " (errno %d)", errno);
- goto out;
- }
- }
- } /* end of the write out for this batch */
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- total_sent += sent_this_iter;
-
- DPRINTF("\r %d: sent %d, skipped %d, ",
- iter, sent_this_iter, skip_this_iter );
-
- if ( last_iter )
- {
- print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
-
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/p2m_size );
- DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
- }
-
- if ( last_iter && debug )
- {
- int minusone = -1;
- memset(to_send, 0xff, BITMAP_SIZE);
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( !write_exact(io_fd, &minusone, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6) (errno %d)",
- errno);
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
- (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > p2m_size*max_factor) )
- {
- DPRINTF("Start last iteration\n");
- last_iter = 1;
-
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
- &ctxt) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
- info.shared_info_frame,
- (unsigned long)ctxt.user_regs.eip,
- (unsigned long)ctxt.user_regs.edx);
- }
-
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
- p2m_size, NULL, 0, &stats) != p2m_size )
- {
- ERROR("Error flushing shadow PT");
- goto out;
- }
-
- sent_last_iter = sent_this_iter;
-
- print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
-
- }
- } /* end of infinite for loop */
-
- DPRINTF("All memory is saved\n");
-
- {
- struct {
- int minustwo;
- int max_vcpu_id;
- uint64_t vcpumap;
- } chunk = { -2, info.max_vcpu_id };
-
- if ( info.max_vcpu_id >= 64 )
- {
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for ( i = 1; i <= info.max_vcpu_id; i++ )
- {
- xc_vcpuinfo_t vinfo;
- if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
- vinfo.online )
- vcpumap |= 1ULL << i;
- }
-
- chunk.vcpumap = vcpumap;
- if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
- {
- ERROR("Error when writing to state file (errno %d)", errno);
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( !write_exact(io_fd, &i, sizeof(int)) )
- {
- ERROR("Error when writing to state file (6') (errno %d)", errno);
- goto out;
- }
-
- /* Send through a list of all the PFNs that were not in map at the close */
- {
- unsigned int i,j;
- unsigned long pfntab[1024];
-
- for ( i = 0, j = 0; i < p2m_size; i++ )
- {
- if ( !is_mapped(live_p2m[i]) )
- j++;
- }
-
- if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
- {
- ERROR("Error when writing to state file (6a) (errno %d)", errno);
- goto out;
- }
-
- for ( i = 0, j = 0; i < p2m_size; )
- {
- if ( !is_mapped(live_p2m[i]) )
- pfntab[j++] = i;
-
- i++;
- if ( (j == 1024) || (i == p2m_size) )
- {
- if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
- {
- ERROR("Error when writing to state file (6b) (errno %d)",
- errno);
- goto out;
- }
- j = 0;
- }
- }
- }
-
- /* Canonicalise the suspend-record frame number. */
- if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
- {
- ERROR("Suspend record is not in range of pseudophys map");
- goto out;
- }
-
- for ( i = 0; i <= info.max_vcpu_id; i++ )
- {
- if ( !(vcpumap & (1ULL << i)) )
- continue;
-
- if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
- {
- ERROR("No context for VCPU%d", i);
- goto out;
- }
-
- /* Canonicalise each GDT frame number. */
- for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
- {
- if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
- {
- ERROR("GDT frame is not in range of pseudophys map");
- goto out;
- }
- }
-
- /* Canonicalise the page table base pointer. */
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
- {
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- ctxt.ctrlreg[3] =
- xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
- {
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- /* Least-significant bit means 'valid PFN'. */
- ctxt.ctrlreg[1] = 1 |
- xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
- }
-
- if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
- {
- ERROR("Error when writing to state file (1) (errno %d)", errno);
- goto out;
- }
- }
-
- /*
- * Reset the MFN to be a known-invalid value. See map_frame_list_list().
- */
- memcpy(page, live_shinfo, PAGE_SIZE);
- ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
- if ( !write_exact(io_fd, page, PAGE_SIZE) )
- {
- ERROR("Error when writing to state file (1) (errno %d)", errno);
- goto out;
- }
-
- /* Success! */
- rc = 0;
-
- out:
-
- if ( live )
- {
- if ( xc_shadow_control(xc_handle, dom,
- XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- }
-
- /* Flush last write and discard cache for file. */
- discard_file_cache(io_fd, 1 /* flush */);
-
- if ( live_shinfo )
- munmap(live_shinfo, PAGE_SIZE);
-
- if ( live_p2m_frame_list_list )
- munmap(live_p2m_frame_list_list, PAGE_SIZE);
-
- if ( live_p2m_frame_list )
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
-
- if ( live_p2m )
- munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
-
- if ( live_m2p )
- munmap(live_m2p, M2P_SIZE(max_mfn));
-
- free(pfn_type);
- free(pfn_batch);
- free(to_send);
- free(to_fix);
- free(to_skip);
-
- DPRINTF("Save exit rc=%d\n",rc);
-
- return !!rc;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/xenguest.h Wed Apr 11 15:45:29 2007 +0100
@@ -16,26 +16,19 @@
/**
- * This function will save a domain running Linux.
+ * This function will save a running domain.
*
* @parm xc_handle a handle to an open hypervisor interface
* @parm fd the file descriptor to save a domain to
* @parm dom the id of the domain
* @return 0 on success, -1 on failure
*/
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(int domid));
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+ uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+ int (*suspend)(int domid), int hvm,
+ void *(*init_qemu_maps)(int, unsigned), /* HVM only */
+ void (*qemu_flip_buffer)(int, int)); /* HVM only */
-/**
- * This function will save a hvm domain running unmodified guest.
- * @return 0 on success, -1 on failure
- */
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(int domid),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int));
/**
* This function will restore a saved domain.
diff -r 3d356a2b1c75 -r db4fcb609383 tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/xg_private.c Wed Apr 11 15:45:29 2007 +0100
@@ -193,17 +193,6 @@ __attribute__((weak))
uint32_t domid,
int memsize,
const char *image_name)
-{
- errno = ENOSYS;
- return -1;
-}
-
-__attribute__((weak))
- int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags,
- int (*suspend)(int domid),
- void *(*init_qemu_maps)(int, unsigned),
- void (*qemu_flip_buffer)(int, int))
{
errno = ENOSYS;
return -1;
diff -r 3d356a2b1c75 -r db4fcb609383 tools/pygrub/src/LiloConf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/pygrub/src/LiloConf.py Wed Apr 11 15:45:29 2007 +0100
@@ -0,0 +1,147 @@
+#
+#LiloConf.py
+#
+
+import sys, re, os
+import logging
+import GrubConf
+
+class LiloImage(object):
+ def __init__(self, lines, path):
+ self.reset(lines, path)
+
+ def __repr__(self):
+ return ("title: %s\n"
+ " root: %s\n"
+ " kernel: %s\n"
+ " args: %s\n"
+ " initrd: %s\n" %(self.title, self.root, self.kernel,
+ self.args, self.initrd))
+ def reset(self, lines, path):
+ self._root = self._initrd = self._kernel = self._args = None
+ self.title = ""
+ self.lines = []
+ self.path = path
+ map(self.set_from_line, lines)
+ self.root = "" # dummy
+
+ def set_from_line(self, line, replace = None):
+ (com, arg) = GrubConf.grub_exact_split(line, 2)
+
+ if self.commands.has_key(com):
+ if self.commands[com] is not None:
+ exec("%s = r\'%s\'" %(self.commands[com], re.sub('^"(.+)"$', r"\1", arg.strip())))
+ else:
+ logging.info("Ignored image directive %s" %(com,))
+ else:
+ logging.warning("Unknown image directive %s" %(com,))
+
+ # now put the line in the list of lines
+ if replace is None:
+ self.lines.append(line)
+ else:
+ self.lines.pop(replace)
+ self.lines.insert(replace, line)
+
+ def set_kernel(self, val):
+ self._kernel = (None, self.path + "/" + val)
+ def get_kernel(self):
+ return self._kernel
+ kernel = property(get_kernel, set_kernel)
+
+ def set_initrd(self, val):
+ self._initrd = (None, self.path + "/" + val)
+ def get_initrd(self):
+ return self._initrd
+ initrd = property(get_initrd, set_initrd)
+
+ # set up command handlers
+ commands = { "label": "self.title",
+ "root": "self.root",
+ "rootnoverify": "self.root",
+ "image": "self.kernel",
+ "initrd": "self.initrd",
+ "append": "self.args",
+ "read-only": None,
+ "chainloader": None,
+ "module": None}
+
+class LiloConfigFile(object):
+ def __init__(self, fn = None):
+ self.filename = fn
+ self.images = []
+ self.timeout = -1
+ self._default = 0
+
+ if fn is not None:
+ self.parse()
+
+ def parse(self, buf = None):
+ if buf is None:
+ if self.filename is None:
+ raise ValueError, "No config file defined to parse!"
+
+ f = open(self.filename, 'r')
+ lines = f.readlines()
+ f.close()
+ else:
+ lines = buf.split("\n")
+
+ path = os.path.dirname(self.filename)
+ img = []
+ for l in lines:
+ l = l.strip()
+ # skip blank lines
+ if len(l) == 0:
+ continue
+ # skip comments
+ if l.startswith('#'):
+ continue
+ # new image
+ if l.startswith("image"):
+ if len(img) > 0:
+ self.add_image(LiloImage(img, path))
+ img = [l]
+ continue
+
+ if len(img) > 0:
+ img.append(l)
+ continue
+
+ (com, arg) = GrubConf.grub_exact_split(l, 2)
+ if self.commands.has_key(com):
+ if self.commands[com] is not None:
+ exec("%s = r\"%s\"" %(self.commands[com], arg.strip()))
+ else:
+ logging.info("Ignored directive %s" %(com,))
+ else:
+ logging.warning("Unknown directive %s" %(com,))
+
+ if len(img) > 0:
+ self.add_image(LiloImage(img, path))
+
+ def add_image(self, image):
+ self.images.append(image)
+
+ def _get_default(self):
+ for i in range(0, len(self.images) - 1):
+ if self.images[i].title == self._default:
+ return i
+ return 0
+ def _set_default(self, val):
+ self._default = val
+ default = property(_get_default, _set_default)
+
+ commands = { "default": "self.default",
+ "timeout": "self.timeout",
+ "prompt": None,
+ "relocatable": None,
+ }
+
+if __name__ == "__main__":
+ if sys.argv < 2:
+ raise RuntimeError, "Need a grub.conf to read"
+ g = LiloConfigFile(sys.argv[1])
+ for i in g.images:
+ print i #, i.title, i.root, i.kernel, i.args, i.initrd
+ print g.default
diff -r 3d356a2b1c75 -r db4fcb609383 tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/pygrub/src/pygrub Wed Apr 11 15:45:29 2007 +0100
@@ -16,6 +16,7 @@ import os, sys, string, struct, tempfile
import os, sys, string, struct, tempfile, re
import copy
import logging
+import platform
import curses, _curses, curses.wrapper, curses.textpad, curses.ascii
import getopt
@@ -24,6 +25,7 @@ sys.path = [ '/usr/lib/python' ] + sys.p
import fsimage
import grub.GrubConf
+import grub.LiloConf
PYGRUB_VER = 0.5
@@ -58,6 +60,13 @@ def get_active_partition(file):
# active partition has 0x80 as the first byte
if struct.unpack("<c", buf[poff:poff+1]) == ('\x80',):
return buf[poff:poff+16]
+
+ # type=0xee: GUID partition table
+ # XXX assume the first partition is active
+ if struct.unpack("<c", buf[poff+4:poff+5]) == ('\xee',):
+ os.lseek(fd, 0x400, 0)
+ buf = os.read(fd, 512)
+ return buf[24:40] # XXX buf[32:40]
# if there's not a partition marked as active, fall back to
# the first partition
@@ -346,7 +355,13 @@ class Grub:
if not os.access(fn, os.R_OK):
raise RuntimeError, "Unable to access %s" %(fn,)
- self.cf = grub.GrubConf.GrubConfigFile()
+ if platform.machine() == 'ia64':
+ self.cf = grub.LiloConf.LiloConfigFile()
+ file_list = ("/efi/redhat/elilo.conf",)
+ else:
+ self.cf = grub.GrubConf.GrubConfigFile()
+ file_list = ("/boot/grub/menu.lst", "/boot/grub/grub.conf",
+ "/grub/menu.lst", "/grub/grub.conf")
if not fs:
# set the config file and parse it
@@ -354,18 +369,15 @@ class Grub:
self.cf.parse()
return
- grubfile = None
- for f in ("/boot/grub/menu.lst", "/boot/grub/grub.conf",
- "/grub/menu.lst", "/grub/grub.conf"):
+ for f in file_list:
if fs.file_exists(f):
- grubfile = f
- break
- if grubfile is None:
- raise RuntimeError, "we couldn't find grub config file in the image provided."
- f = fs.open_file(grubfile)
+ self.cf.filename = f
+ break
+ if self.cf.filename is None:
+ raise RuntimeError, "couldn't find bootloader config file in the image provided."
+ f = fs.open_file(self.cf.filename)
buf = f.read()
del f
- # then parse the grub config
self.cf.parse(buf)
def run(self):
diff -r 3d356a2b1c75 -r db4fcb609383 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/python/xen/xend/XendCheckpoint.py Wed Apr 11 15:45:29 2007 +0100
@@ -75,13 +75,6 @@ def save(fd, dominfo, network, live, dst
image_cfg = dominfo.info.get('image', {})
hvm = dominfo.info.is_hvm()
- stdvga = 0
-
- if hvm:
- log.info("save hvm domain")
- if dominfo.info['platform'].has_key('stdvga'):
- if dominfo.info['platform']['stdvga'] == 1:
- stdvga = 1
# xc_save takes three customization parameters: maxit, max_f, and
# flags the last controls whether or not save is 'live', while the
diff -r 3d356a2b1c75 -r db4fcb609383 tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/python/xen/xend/server/DevController.py Wed Apr 11 15:45:29 2007 +0100
@@ -223,6 +223,7 @@ class DevController:
xstransact.Remove(backpath)
xstransact.Remove(frontpath)
+ self.vm._removeVm("device/%s/%d" % (self.deviceClass, devid))
def configurations(self):
return map(self.configuration, self.deviceIDs())
diff -r 3d356a2b1c75 -r db4fcb609383 tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/python/xen/xend/server/netif.py Wed Apr 11 15:45:29 2007 +0100
@@ -88,46 +88,6 @@ def parseRate(ratestr):
return "%lu,%lu" % (bytes_per_interval, interval_usecs)
-write_rate_G_re = re.compile('^([0-9]+)000000000(B/s@[0-9]+us)$')
-write_rate_M_re = re.compile('^([0-9]+)000000(B/s@[0-9]+us)$')
-write_rate_K_re = re.compile('^([0-9]+)000(B/s@[0-9]+us)$')
-write_rate_s_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000000us$')
-write_rate_m_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000us$')
-
-def formatRate(rate):
- (bytes_per_interval, interval_usecs) = map(long, rate.split(','))
-
- if interval_usecs != 0:
- bytes_per_second = (bytes_per_interval * 1000 * 1000) / interval_usecs
- else:
- bytes_per_second = 0xffffffffL
-
- ratestr = "%uB/s@%uus" % (bytes_per_second, interval_usecs)
-
- # look for '000's
- m = write_rate_G_re.match(ratestr)
- if m:
- ratestr = m.group(1) + "G" + m.group(2)
- else:
- m = write_rate_M_re.match(ratestr)
- if m:
- ratestr = m.group(1) + "M" + m.group(2)
- else:
- m = write_rate_K_re.match(ratestr)
- if m:
- ratestr = m.group(1) + "K" + m.group(2)
-
- m = write_rate_s_re.match(ratestr)
- if m:
- ratestr = m.group(1) + "s"
- else:
- m = write_rate_m_re.match(ratestr)
- if m:
- ratestr = m.group(1) + "ms"
-
- return ratestr
-
-
class NetifController(DevController):
"""Network interface controller. Handles all network devices for a domain.
"""
@@ -138,8 +98,7 @@ class NetifController(DevController):
def getDeviceDetails(self, config):
"""@see DevController.getDeviceDetails"""
- script = os.path.join(xoptions.network_script_dir,
- config.get('script', xoptions.get_vif_script()))
+ script = config.get('script', xoptions.get_vif_script())
typ = config.get('type')
bridge = config.get('bridge')
mac = config.get('mac')
@@ -149,24 +108,17 @@ class NetifController(DevController):
ipaddr = config.get('ip')
model = config.get('model')
- devid = self.allocateDeviceID()
-
if not typ:
typ = xoptions.netback_type
-
+
if not mac:
mac = randomMAC()
+ devid = self.allocateDeviceID()
+
back = { 'script' : script,
'mac' : mac,
- 'handle' : "%i" % devid,
'type' : typ }
-
- if typ == 'ioemu':
- front = {}
- else:
- front = { 'handle' : "%i" % devid,
- 'mac' : mac }
if ipaddr:
back['ip'] = ipaddr
if bridge:
@@ -174,12 +126,26 @@ class NetifController(DevController):
if vifname:
back['vifname'] = vifname
if rate:
- back['rate'] = parseRate(rate)
+ back['rate'] = rate
if uuid:
back['uuid'] = uuid
if model:
back['model'] = model
+ config_path = "device/%s/%d/" % (self.deviceClass, devid)
+ for x in back:
+ self.vm._writeVm(config_path + x, back[x])
+
+ back['handle'] = "%i" % devid
+ back['script'] = os.path.join(xoptions.network_script_dir, script)
+ if rate:
+ back['rate'] = parseRate(rate)
+
+ front = {}
+ if typ != 'ioemu':
+ front = { 'handle' : "%i" % devid,
+ 'mac' : mac }
+
return (devid, back, front)
@@ -187,14 +153,17 @@ class NetifController(DevController):
"""@see DevController.configuration"""
result = DevController.getDeviceConfiguration(self, devid)
- devinfo = self.readBackend(devid, 'script', 'ip', 'bridge',
- 'mac', 'type', 'vifname', 'rate',
- 'uuid', 'model')
+
+ config_path = "device/%s/%d/" % (self.deviceClass, devid)
+ devinfo = ()
+ for x in ( 'script', 'ip', 'bridge', 'mac',
+ 'type', 'vifname', 'rate', 'uuid', 'model' ):
+ y = self.vm._readVm(config_path + x)
+ devinfo += (y,)
(script, ip, bridge, mac, typ, vifname, rate, uuid, model) = devinfo
if script:
- network_script_dir = xoptions.network_script_dir + os.sep
- result['script'] = script.replace(network_script_dir, "")
+ result['script'] = script
if ip:
result['ip'] = ip
if bridge:
@@ -206,11 +175,10 @@ class NetifController(DevController):
if vifname:
result['vifname'] = vifname
if rate:
- result['rate'] = formatRate(rate)
+ result['rate'] = rate
if uuid:
result['uuid'] = uuid
if model:
result['model'] = model
return result
-
diff -r 3d356a2b1c75 -r db4fcb609383 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/xcutils/xc_save.c Wed Apr 11 15:45:29 2007 +0100
@@ -174,12 +174,9 @@ main(int argc, char **argv)
max_f = atoi(argv[4]);
flags = atoi(argv[5]);
- if (flags & XCFLAGS_HVM)
- ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags,
- &suspend, &init_qemu_maps, &qemu_flip_buffer);
- else
- ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags,
- &suspend);
+ ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags,
+ &suspend, !!(flags & XCFLAGS_HVM),
+ &init_qemu_maps, &qemu_flip_buffer);
xc_interface_close(xc_fd);
diff -r 3d356a2b1c75 -r db4fcb609383
unmodified_drivers/linux-2.6/platform-pci/evtchn.c
--- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Wed Apr 11
07:30:02 2007 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Wed Apr 11
15:45:29 2007 +0100
@@ -28,8 +28,10 @@
* IN THE SOFTWARE.
*/
+#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/spinlock.h>
#include <xen/evtchn.h>
#include <xen/interface/hvm/ioreq.h>
#include <xen/features.h>
@@ -41,29 +43,37 @@
void *shared_info_area;
-static DEFINE_MUTEX(irq_evtchn_mutex);
-
#define is_valid_evtchn(x) ((x) != 0)
#define evtchn_from_irq(x) (irq_evtchn[irq].evtchn)
static struct {
+ spinlock_t lock;
irqreturn_t(*handler) (int, void *, struct pt_regs *);
void *dev_id;
int evtchn;
int close:1; /* close on unbind_from_irqhandler()? */
int inuse:1;
+ int in_handler:1;
} irq_evtchn[256];
static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
[0 ... NR_EVENT_CHANNELS-1] = -1 };
-static int find_unbound_irq(void)
+static DEFINE_SPINLOCK(irq_alloc_lock);
+
+static int alloc_xen_irq(void)
{
static int warned;
int irq;
- for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
- if (!irq_evtchn[irq].inuse)
- return irq;
+ spin_lock(&irq_alloc_lock);
+
+ for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) {
+ if (irq_evtchn[irq].inuse)
+ continue;
+ irq_evtchn[irq].inuse = 1;
+ spin_unlock(&irq_alloc_lock);
+ return irq;
+ }
if (!warned) {
warned = 1;
@@ -71,7 +81,16 @@ static int find_unbound_irq(void)
"increase irq_evtchn[] size in evtchn.c.\n");
}
+ spin_unlock(&irq_alloc_lock);
+
return -ENOSPC;
+}
+
+static void free_xen_irq(int irq)
+{
+ spin_lock(&irq_alloc_lock);
+ irq_evtchn[irq].inuse = 0;
+ spin_unlock(&irq_alloc_lock);
}
int irq_to_evtchn_port(int irq)
@@ -93,8 +112,7 @@ void unmask_evtchn(int port)
shared_info_t *s = shared_info_area;
vcpu_info_t *vcpu_info;
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_info = &s->vcpu_info[cpu];
/* Slow path (hypercall) if this is a non-local port. We only
@@ -103,7 +121,7 @@ void unmask_evtchn(int port)
evtchn_unmask_t op = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask,
&op);
- preempt_enable();
+ put_cpu();
return;
}
@@ -121,7 +139,8 @@ void unmask_evtchn(int port)
if (!vcpu_info->evtchn_upcall_mask)
force_evtchn_callback();
}
- preempt_enable();
+
+ put_cpu();
}
EXPORT_SYMBOL(unmask_evtchn);
@@ -135,20 +154,19 @@ int bind_listening_port_to_irqhandler(
struct evtchn_alloc_unbound alloc_unbound;
int err, irq;
- mutex_lock(&irq_evtchn_mutex);
-
- irq = find_unbound_irq();
- if (irq < 0) {
- mutex_unlock(&irq_evtchn_mutex);
+ irq = alloc_xen_irq();
+ if (irq < 0)
return irq;
- }
+
+ spin_lock_irq(&irq_evtchn[irq].lock);
alloc_unbound.dom = DOMID_SELF;
alloc_unbound.remote_dom = remote_domain;
err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
&alloc_unbound);
if (err) {
- mutex_unlock(&irq_evtchn_mutex);
+ spin_unlock_irq(&irq_evtchn[irq].lock);
+ free_xen_irq(irq);
return err;
}
@@ -156,13 +174,13 @@ int bind_listening_port_to_irqhandler(
irq_evtchn[irq].dev_id = dev_id;
irq_evtchn[irq].evtchn = alloc_unbound.port;
irq_evtchn[irq].close = 1;
- irq_evtchn[irq].inuse = 1;
evtchn_to_irq[alloc_unbound.port] = irq;
unmask_evtchn(alloc_unbound.port);
- mutex_unlock(&irq_evtchn_mutex);
+ spin_unlock_irq(&irq_evtchn[irq].lock);
+
return irq;
}
EXPORT_SYMBOL(bind_listening_port_to_irqhandler);
@@ -176,34 +194,34 @@ int bind_caller_port_to_irqhandler(
{
int irq;
- mutex_lock(&irq_evtchn_mutex);
-
- irq = find_unbound_irq();
- if (irq < 0) {
- mutex_unlock(&irq_evtchn_mutex);
+ irq = alloc_xen_irq();
+ if (irq < 0)
return irq;
- }
+
+ spin_lock_irq(&irq_evtchn[irq].lock);
irq_evtchn[irq].handler = handler;
irq_evtchn[irq].dev_id = dev_id;
irq_evtchn[irq].evtchn = caller_port;
irq_evtchn[irq].close = 0;
- irq_evtchn[irq].inuse = 1;
evtchn_to_irq[caller_port] = irq;
unmask_evtchn(caller_port);
- mutex_unlock(&irq_evtchn_mutex);
+ spin_unlock_irq(&irq_evtchn[irq].lock);
+
return irq;
}
EXPORT_SYMBOL(bind_caller_port_to_irqhandler);
void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
- int evtchn = evtchn_from_irq(irq);
-
- mutex_lock(&irq_evtchn_mutex);
+ int evtchn;
+
+ spin_lock_irq(&irq_evtchn[irq].lock);
+
+ evtchn = evtchn_from_irq(irq);
if (is_valid_evtchn(evtchn)) {
evtchn_to_irq[irq] = -1;
@@ -216,21 +234,28 @@ void unbind_from_irqhandler(unsigned int
irq_evtchn[irq].handler = NULL;
irq_evtchn[irq].evtchn = 0;
- irq_evtchn[irq].inuse = 0;
-
- mutex_unlock(&irq_evtchn_mutex);
+
+ spin_unlock_irq(&irq_evtchn[irq].lock);
+
+ while (irq_evtchn[irq].in_handler)
+ cpu_relax();
+
+ free_xen_irq(irq);
}
EXPORT_SYMBOL(unbind_from_irqhandler);
void notify_remote_via_irq(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn;
+
+ evtchn = evtchn_from_irq(irq);
if (is_valid_evtchn(evtchn))
notify_remote_via_evtchn(evtchn);
}
EXPORT_SYMBOL(notify_remote_via_irq);
-irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t evtchn_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
unsigned int l1i, port;
/* XXX: All events are bound to vcpu0 but irq may be redirected. */
@@ -249,13 +274,30 @@ irqreturn_t evtchn_interrupt(int irq, vo
while ((l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i])) {
port = (l1i * BITS_PER_LONG) + __ffs(l2);
synch_clear_bit(port, &s->evtchn_pending[0]);
+
irq = evtchn_to_irq[port];
- if ((irq >= 0) &&
- ((handler = irq_evtchn[irq].handler) != NULL))
- handler(irq, irq_evtchn[irq].dev_id, regs);
- else
- printk(KERN_WARNING "unexpected event channel "
- "upcall on port %d!\n", port);
+ if (irq < 0)
+ continue;
+
+ spin_lock(&irq_evtchn[irq].lock);
+ handler = irq_evtchn[irq].handler;
+ dev_id = irq_evtchn[irq].dev_id;
+ if (unlikely(handler == NULL)) {
+ printk("Xen IRQ%d (port %d) has no handler!\n",
+ irq, port);
+ spin_unlock(&irq_evtchn[irq].lock);
+ continue;
+ }
+ irq_evtchn[irq].in_handler = 1;
+ spin_unlock(&irq_evtchn[irq].lock);
+
+ local_irq_enable();
+ handler(irq, irq_evtchn[irq].dev_id, regs);
+ local_irq_disable();
+
+ spin_lock(&irq_evtchn[irq].lock);
+ irq_evtchn[irq].in_handler = 0;
+ spin_unlock(&irq_evtchn[irq].lock);
}
}
@@ -267,16 +309,6 @@ void force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
EXPORT_SYMBOL(force_evtchn_callback);
-
-void irq_suspend(void)
-{
- mutex_lock(&irq_evtchn_mutex);
-}
-
-void irq_suspend_cancel(void)
-{
- mutex_unlock(&irq_evtchn_mutex);
-}
void irq_resume(void)
{
@@ -289,6 +321,16 @@ void irq_resume(void)
for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
irq_evtchn[irq].evtchn = 0;
-
- mutex_unlock(&irq_evtchn_mutex);
-}
+}
+
+int xen_irq_init(struct pci_dev *pdev)
+{
+ int irq;
+
+ for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
+ spin_lock_init(&irq_evtchn[irq].lock);
+
+ return request_irq(pdev->irq, evtchn_interrupt,
+ SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT,
+ "xen-platform-pci", pdev);
+}
diff -r 3d356a2b1c75 -r db4fcb609383
unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
--- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c Wed Apr
11 07:30:02 2007 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c Wed Apr
11 15:45:29 2007 +0100
@@ -1,24 +1,81 @@
#include <linux/config.h>
+#include <linux/stop_machine.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
#include <xen/xenbus.h>
#include "platform-pci.h"
#include <asm/hypervisor.h>
-int __xen_suspend(int fast_suspend)
+/*
+ * Spinning prevents, for example, APs touching grant table entries while
+ * the shared grant table is not mapped into the address space immediately
+ * after resume.
+ */
+static void ap_suspend(void *_ap_spin)
+{
+ int *ap_spin = _ap_spin;
+
+ BUG_ON(!irqs_disabled());
+
+ while (*ap_spin) {
+ cpu_relax();
+ HYPERVISOR_yield();
+ }
+}
+
+static int bp_suspend(void)
{
int suspend_cancelled;
- xenbus_suspend();
- platform_pci_suspend();
+ BUG_ON(!irqs_disabled());
suspend_cancelled = HYPERVISOR_shutdown(SHUTDOWN_suspend);
- if (suspend_cancelled) {
- platform_pci_suspend_cancel();
+ if (!suspend_cancelled) {
+ platform_pci_resume();
+ gnttab_resume();
+ irq_resume();
+ }
+
+ return suspend_cancelled;
+}
+
+int __xen_suspend(int fast_suspend)
+{
+ int err, suspend_cancelled, ap_spin;
+
+ xenbus_suspend();
+
+ preempt_disable();
+
+ /* Prevent any races with evtchn_interrupt() handler. */
+ disable_irq(xen_platform_pdev->irq);
+
+ ap_spin = 1;
+ smp_mb();
+
+ err = smp_call_function(ap_suspend, &ap_spin, 0, 0);
+ if (err < 0) {
+ preempt_enable();
xenbus_suspend_cancel();
- } else {
- platform_pci_resume();
+ return err;
+ }
+
+ local_irq_disable();
+ suspend_cancelled = bp_suspend();
+ local_irq_enable();
+
+ smp_mb();
+ ap_spin = 0;
+
+ enable_irq(xen_platform_pdev->irq);
+
+ preempt_enable();
+
+ if (!suspend_cancelled)
xenbus_resume();
- }
+ else
+ xenbus_suspend_cancel();
return 0;
}
diff -r 3d356a2b1c75 -r db4fcb609383
unmodified_drivers/linux-2.6/platform-pci/platform-compat.c
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c Wed Apr
11 07:30:02 2007 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c Wed Apr
11 15:45:29 2007 +0100
@@ -12,11 +12,10 @@ EXPORT_SYMBOL(system_state);
EXPORT_SYMBOL(system_state);
#endif
-static inline void ctrl_alt_del(void)
+void ctrl_alt_del(void)
{
kill_proc(1, SIGINT, 1); /* interrupt init */
}
-EXPORT_SYMBOL(ctrl_alt_del);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
size_t strcspn(const char *s, const char *reject)
diff -r 3d356a2b1c75 -r db4fcb609383
unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c Wed Apr 11
07:30:02 2007 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c Wed Apr 11
15:45:29 2007 +0100
@@ -40,7 +40,6 @@
#include <xen/interface/hvm/params.h>
#include <xen/features.h>
#include <xen/evtchn.h>
-#include <xen/gnttab.h>
#ifdef __ia64__
#include <asm/xen/xencomm.h>
#endif
@@ -61,6 +60,8 @@ MODULE_AUTHOR("ssmith@xxxxxxxxxxxxx");
MODULE_AUTHOR("ssmith@xxxxxxxxxxxxx");
MODULE_DESCRIPTION("Xen platform PCI device");
MODULE_LICENSE("GPL");
+
+struct pci_dev *xen_platform_pdev;
static unsigned long shared_info_frame;
static uint64_t callback_via;
@@ -88,8 +89,6 @@ static int __devinit init_xen_info(void)
ioremap(shared_info_frame << PAGE_SHIFT, PAGE_SIZE);
if (shared_info_area == NULL)
panic("can't map shared info\n");
-
- gnttab_init();
return 0;
}
@@ -199,8 +198,10 @@ static int set_callback_via(uint64_t via
return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
}
+int xen_irq_init(struct pci_dev *pdev);
int xenbus_init(void);
int xen_reboot_init(void);
+int gnttab_init(void);
static int __devinit platform_pci_init(struct pci_dev *pdev,
const struct pci_device_id *ent)
@@ -208,6 +209,10 @@ static int __devinit platform_pci_init(s
int i, ret;
long ioaddr, iolen;
long mmio_addr, mmio_len;
+
+ if (xen_platform_pdev)
+ return -EBUSY;
+ xen_platform_pdev = pdev;
i = pci_enable_device(pdev);
if (i)
@@ -249,9 +254,10 @@ static int __devinit platform_pci_init(s
if ((ret = init_xen_info()))
goto out;
- if ((ret = request_irq(pdev->irq, evtchn_interrupt,
- SA_SHIRQ | SA_SAMPLE_RANDOM,
- "xen-platform-pci", pdev)))
+ if ((ret = gnttab_init()))
+ goto out;
+
+ if ((ret = xen_irq_init(pdev)))
goto out;
if ((ret = set_callback_via(callback_via)))
@@ -291,18 +297,6 @@ static struct pci_driver platform_driver
};
static int pci_device_registered;
-
-void platform_pci_suspend(void)
-{
- gnttab_suspend();
- irq_suspend();
-}
-
-void platform_pci_suspend_cancel(void)
-{
- irq_suspend_cancel();
- gnttab_resume();
-}
void platform_pci_resume(void)
{
@@ -319,12 +313,8 @@ void platform_pci_resume(void)
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
- irq_resume();
-
if (set_callback_via(callback_via))
printk("platform_pci_resume failure!\n");
-
- gnttab_resume();
}
static int __init platform_pci_module_init(void)
diff -r 3d356a2b1c75 -r db4fcb609383
unmodified_drivers/linux-2.6/platform-pci/platform-pci.h
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h Wed Apr 11
07:30:02 2007 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h Wed Apr 11
15:45:29 2007 +0100
@@ -22,16 +22,11 @@
#ifndef _XEN_PLATFORM_PCI_H
#define _XEN_PLATFORM_PCI_H
-#include <linux/interrupt.h>
+#include <linux/pci.h>
unsigned long alloc_xen_mmio(unsigned long len);
-int gnttab_init(void);
-irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-void irq_suspend(void);
-void irq_suspend_cancel(void);
-
-void platform_pci_suspend(void);
-void platform_pci_suspend_cancel(void);
void platform_pci_resume(void);
+extern struct pci_dev *xen_platform_pdev;
+
#endif /* _XEN_PLATFORM_PCI_H */
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|