| Hello,
I've been working on VM snapshots/CoW for Xen (as seen at Xen Summit
this year). I'm happy to release my first version.
There are some known issues with capturing dirtied pages for HVM guests.
I think it's related to the QEMU code (I believe I'm not catching the
pages that QEMU dirties). However, it works with both 32-bit and 64-bit
PV guests.
This release includes modifications to Linux, Xen, and some tools (a
library, FUSE fs, and testing tool).
The FUSE file system will take a snapshot if you try to create a file in
the directory where you mount the xencowfs file system.
The testing tool pauses a domain, enables CoW, takes a dump of the CoW
image, takes two live memory dumps, unpauses the domain for a bit, then
takes another CoW image. It compares all the images and reports on which
pages are different.
These patches are against xen-unstable revision 19425; however, I had no
trouble applying them to the current revision of xen-unstable (19553).
Please remember this is an alpha release, so there are likely to be some
problems. Please let me know if you find any!
Patrick
 diff -r 832aac894efd drivers/xen/Kconfig
--- a/drivers/xen/Kconfig       Wed Nov 19 13:15:46 2008 +0000
+++ b/drivers/xen/Kconfig       Mon Mar 16 00:01:12 2009 -0700
@@ -312,4 +312,7 @@
 config XEN_DEVMEM
        def_bool y
 
+config XEN_XENCOW
+       def_bool y
+
 endif
diff -r 832aac894efd drivers/xen/Makefile
--- a/drivers/xen/Makefile      Wed Nov 19 13:15:46 2008 +0000
+++ b/drivers/xen/Makefile      Mon Mar 16 00:01:12 2009 -0700
@@ -23,3 +23,4 @@
 obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL)                += sfc_netutil/
 obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND)    += sfc_netfront/
 obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND)     += sfc_netback/
+obj-$(CONFIG_XEN_XENCOW)       += xencow/
diff -r 832aac894efd drivers/xen/xencow/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/Makefile       Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,2 @@
+
+obj-m := xencow.o
diff -r 832aac894efd drivers/xen/xencow/common.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/common.h       Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,74 @@
+/******************************************************************************
+ * common.h
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_XENCOW_COMMON_H__
+#define __XEN_XENCOW_COMMON_H__
+
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <xen/interface/platform.h>
+#include <xen/driver_util.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/xencow.h>
+
+/* Debug trace (via pr_debug) prefixed with the emitting file and line. */
+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
+                                    __FILE__ , __LINE__ , ## _a )
+/* Warning-level printk tagged with the "xen_cow: " prefix. */
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_cow: " fmt, ##args)
+
+
+#endif /* __XEN_XENCOW_COMMON_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 832aac894efd drivers/xen/xencow/xencow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/xencow.c       Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,246 @@
+/******************************************************************************
+ * xencow.c
+ *
+ * Xen Copy-on-Write Kernel Driver - Initialises CoW buffer for userspace
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include "common.h"
+
+
+static int xencow_major;
+
+
+/* Mark each of the first @num pages up to date, then drop its reference. */
+static void xencow_release_user_pages(struct page *pages[], int num)
+{
+    struct page **cursor;
+
+    for ( cursor = pages; num > 0; num--, cursor++ )
+    {
+        SetPageUptodate(*cursor);
+        put_page(*cursor);
+    }
+}
+
+/*
+ * Pin @num user pages of the current process starting at @addr into @pages.
+ * Returns 0 if all were pinned; otherwise any partially pinned pages are
+ * released and -E2BIG (short count) or get_user_pages()'s error is returned.
+ */
+static int xencow_get_user_pages(unsigned long addr, int num,
+                                 struct page *pages[])
+{
+    int ret;
+
+    /* get_user_pages() is called with mmap_sem held for reading. */
+    down_read(&current->mm->mmap_sem);
+    ret = get_user_pages(current, current->mm, addr, num, 0, 0, pages, NULL);
+    up_read(&current->mm->mmap_sem);
+
+    if ( ret == num )
+        return 0;
+    if ( ret < 0 )
+        return ret;
+
+    xencow_release_user_pages(pages, ret);
+    return -E2BIG;
+}
+
+/*
+ * Translate @page into a machine frame number.
+ *
+ * The page is first mapped to its page frame number with page_to_pfn(),
+ * and that PFN is then converted with pfn_to_mfn().
+ */
+static inline unsigned long xencow_page_to_mfn(struct page *page)
+{
+    return pfn_to_mfn(page_to_pfn(page));
+}
+
+/* Fill @mfns with the MFNs backing the @num user pages starting at @addr.
+ * Returns 0 on success or a negative errno; no page is left pinned. */
+static int xencow_get_page_mfns(unsigned long addr, int num,
+                                unsigned long mfns[])
+{
+    struct page **pages;
+    int ret, i;
+
+    /* @num comes from the caller, so keep the array off the kernel stack. */
+    if ( num < 0 )
+        return -EINVAL;
+    pages = kcalloc(num, sizeof(*pages), GFP_KERNEL);
+    if ( pages == NULL )
+        return -ENOMEM;
+
+    ret = xencow_get_user_pages(addr, num, pages);
+    if ( ret != 0 )
+        goto out;
+
+    /* A zero MFN is an error, but the pinned pages must still be released. */
+    for ( i = 0; i < num && ret == 0; i++ )
+        if ( (mfns[i] = xencow_page_to_mfn(pages[i])) == 0 )
+            ret = -EFAULT;
+    xencow_release_user_pages(pages, num);
+
+out:
+    kfree(pages);
+    return ret;
+}
+
+/*
+ * ioctl handler.  XEN_COW_IOCTL_INIT reads addr and num_mfns from the user
+ * xencow_init_t at @arg and writes the MFNs of that buffer back into its
+ * mfns[] array.  Returns 0, -ENOTTY for an unknown @cmd, or another -errno.
+ */
+static int xencow_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+    int ret = 0;
+
+    switch ( cmd )
+    {
+
+    case XEN_COW_IOCTL_INIT:
+    {
+        xencow_init_t *cow_init;
+        xencow_init_t __user *cow_init_u = (xencow_init_t __user *)arg;
+        size_t mfns_size;
+        int num_mfns;
+
+        /* Check access on user init struct */
+        ret = -EFAULT;
+        if ( !access_ok(VERIFY_READ, cow_init_u, sizeof(xencow_init_t)) )
+            break;
+
+        /* Get the number of frames in the buffer */
+        ret = __get_user(num_mfns, &cow_init_u->num_mfns);
+        if ( ret != 0 )
+            break;
+
+        /*
+         * The count is untrusted: a negative or huge value would wrap the
+         * allocation size and leave mfns[] shorter than num_mfns entries.
+         */
+        ret = -EINVAL;
+        if ( num_mfns < 0 || num_mfns >
+             (INT_MAX - sizeof(xencow_init_t)) / sizeof(unsigned long) )
+            break;
+        mfns_size = num_mfns * sizeof(unsigned long);
+
+        /* Allocate space */
+        ret = -ENOMEM;
+        cow_init = kmalloc(sizeof(xencow_init_t) + mfns_size, GFP_KERNEL);
+        if ( cow_init == NULL )
+            break;
+        cow_init->num_mfns = num_mfns;
+
+        /* Get start address of buffer */
+        ret = __get_user(cow_init->addr, &cow_init_u->addr);
+        if ( ret != 0 )
+            goto init_out;
+
+        /* Get page buffer MFNs */
+        ret = xencow_get_page_mfns(cow_init->addr, num_mfns, cow_init->mfns);
+        if ( ret != 0 )
+            goto init_out;
+
+        /* Send page buffer MFNs to user; copy_to_user() checks the access */
+        if ( copy_to_user(cow_init_u->mfns, cow_init->mfns, mfns_size) )
+            ret = -EFAULT;
+
+    init_out:
+        kfree(cow_init);
+    }
+    break;
+
+    default:
+           ret = -ENOTTY;
+           break;
+    }
+
+    return ret;
+}
+
+static const struct file_operations xencow_fops = {
+    .owner   = THIS_MODULE,
+    .ioctl   = xencow_ioctl,    /* only handler provided by this driver */
+};
+
+/*
+ * Module load: register the "xencow" char device (dynamic major) and, if the
+ * xen class exists, create "xencow0".  Returns 0, -ENODEV off Xen, or -errno.
+ */
+static int __init xencow_init(void)
+{
+    struct class *xen_class;
+    int major;
+
+    if ( !is_running_on_xen() )
+        return -ENODEV;
+
+    major = register_chrdev(0, "xencow", &xencow_fops);
+    if ( major < 0 )
+    {
+        WPRINTK("Couldn't register /dev/xen/xencow\n");
+        return major;
+    }
+    xencow_major = major;
+    DPRINTK("Created misc_dev [/dev/xen/xencow%d]\n", xencow_major);
+
+    /* A missing xen class is bad, but not fatal */
+    xen_class = get_xen_class();
+    if ( xen_class == NULL )
+        WPRINTK("sysfs xen_class not created\n");
+    else
+        class_device_create(xen_class, NULL, MKDEV(xencow_major, 0),
+                            NULL, "xencow0");
+
+    DPRINTK("XenCoW device successfully created\n");
+    return 0;
+}
+
+static void __exit xencow_exit(void)
+{
+    struct class *class = get_xen_class();
+    int ret;
+
+    if ( class != NULL )  /* undo xencow_init()'s class_device_create() */
+        class_device_destroy(class, MKDEV(xencow_major, 0));
+    ret = unregister_chrdev(xencow_major, "xencow");
+    if ( ret < 0 )
+        WPRINTK("Error: Couldn't unregister /dev/xen/xencow: %d\n", ret);
+    else
+        DPRINTK("XenCoW device successfully removed\n");
+}
+
+module_init(xencow_init);
+module_exit(xencow_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff -r 832aac894efd include/xen/interface/io/xencow.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/include/xen/interface/io/xencow.h Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * xencow.h
+ *
+ * XenCoW Common Structures
+ *
+ * Copyright (C) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#ifndef _XEN_PUBLIC_IO_XENCOW_H
+#define _XEN_PUBLIC_IO_XENCOW_H
+
+
+#include "ring.h"
+
+
+#define XEN_COW_IOC_MAGIC   'w'
+#define XEN_COW_IOCTL_INIT  _IO(XEN_COW_IOC_MAGIC, 1)
+
+
+/* Request/response entry layouts and ring types for the XenCoW ring buffer. */
+typedef struct xencow_request_st {
+   ulong mfn;   /* presumably a machine frame number - TODO confirm */
+} xencow_request_t;
+
+typedef struct xencow_response_st {
+   ulong pfn;   /* presumably a page frame number - TODO confirm */
+} xencow_response_t;
+
+DEFINE_RING_TYPES(xencow, xencow_request_t, xencow_response_t);
+
+
+/* Argument of XEN_COW_IOCTL_INIT, used to initialise a XenCoW snapshot. */
+typedef struct xencow_init_st {
+    /* Start address of buffer (user virtual address; read by the driver) */
+    unsigned long addr;
+    /* Number of frames in buffer, i.e. entries in mfns[] (read by driver) */
+    int num_mfns;
+    /* MFNs of buffer frames (flexible array; written back by the driver) */
+    unsigned long mfns[];
+} xencow_init_t;
+
+
+
+#endif /* _XEN_PUBLIC_IO_XENCOW_H */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/Makefile
--- a/tools/Makefile    Fri Mar 20 17:42:46 2009 +0000
+++ b/tools/Makefile    Mon Apr 20 10:21:49 2009 -0700
@@ -26,6 +26,7 @@
 SUBDIRS-$(CONFIG_Linux) += fs-back
 SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
 SUBDIRS-y += xenpmd
+SUBDIRS-y += xencow
 
 # These don't cross-compile
 ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
diff -r 0477f9061c8a tools/xencow/COPYING
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/COPYING      Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,340 @@
+                   GNU GENERAL PUBLIC LICENSE
+                      Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                           Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                   GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                           NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                    END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff -r 0477f9061c8a tools/xencow/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/Makefile     Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,10 @@
+# Top-level makefile for the Xen copy-on-write tools.  The all, install and
+# clean targets are forwarded to the matching subdirs-% target for each
+# directory listed in SUBDIRS-y (subdirs-% is expected to come from Rules.mk).
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y := lib xencowfs test
+
+.PHONY: all clean install
+all install clean: %: subdirs-%
diff -r 0477f9061c8a tools/xencow/README
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/README       Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,19 @@
+Xen Copy on Write
+-----------------------
+Provide copy on write functionality for the memory of Xen domains.
+
+
+
+
+Usage Notes and issues
+----------------------
+
+
+Future Work
+-----------
+
+Authors
+-------
+Chris Matthews         <cmatthew@xxxxxxxxxx>
+Geoffrey Lefebvre      <geoffrey@xxxxxxxxx>
+Brendan Cully          <brendan@xxxxxxxxx>
diff -r 0477f9061c8a tools/xencow/lib/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,66 @@
+# Build settings for libxencow: a static archive and a versioned shared
+# library built from xc.c and xencow.c.
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAKE_LINK=ln -sf
+
+# Shared library version; SONAME only carries the major number.
+MAJOR    = 0
+MINOR    = 0
+SONAME   = libxencow.so.$(MAJOR)
+
+CFLAGS   += -I $(XEN_XC)
+CFLAGS   += -I ./
+CFLAGS   += $(CFLAGS_libxenctrl)
+LDFLAGS  += $(LDFLAGS_libxenctrl)
+
+SRCS     :=
+SRCS     += xc.c xencow.c
+
+# Warnings are fatal, except for unused entities.
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -fPIC
+CFLAGS   += -g
+
+# NOTE(review): CTRL_SRCS-y is never set in this makefile and neither of the
+# two variables below is referenced by any rule here; they look like
+# leftovers from the makefile this one was derived from -- confirm and drop.
+CTRL_LIB_OBJS := $(patsubst %.c,%.o,$(CTRL_SRCS-y))
+CTRL_PIC_OBJS += $(patsubst %.c,%.opic,$(CTRL_SRCS-y))
+
+# Get gcc to generate the dependencies for us.
+CFLAGS   += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+# .o objects go into the static archive, .opic objects into the shared
+# library (the %.opic rule is expected to come from Rules.mk).
+OBJS     = $(SRCS:.c=.o)
+OBJS_PIC = $(SRCS:.c=.opic)
+IBINS    :=
+
+LIB      = libxencow.a libxencow.so.$(MAJOR).$(MINOR)
+
+.PHONY: all
+all: $(LIB)
+
+# Install both libraries and the public headers, then create the
+# libxencow.so.MAJOR and libxencow.so symlinks in the library directory.
+# (The first $(MAKE_LINK) line had been split in two by mail line-wrapping,
+# which left a recipe line without its target argument.)
+.PHONY: install
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
+       $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
+       $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
+       $(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxencow.so.$(MAJOR)
+       $(MAKE_LINK) libxencow.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxencow.so
+       $(INSTALL_DATA) xencow.h $(DESTDIR)$(INCLUDEDIR)
+       $(INSTALL_DATA) xencow_list.h $(DESTDIR)$(INCLUDEDIR)
+
+# Remove build products, editor backups, generated dependency files, and the
+# "xen" and TAGS entries.
+.PHONY: clean
+clean:
+       rm -rf *.a *.so* *.o *.opic $(LIB) *~ $(DEPS) xen TAGS
+
+# Link the shared library from the PIC objects and create the two
+# development symlinks in the build directory.  (The link command had been
+# split in two by mail line-wrapping, separating the soname flag from its
+# value.)
+libxencow.so.$(MAJOR).$(MINOR): $(OBJS_PIC)
+       $(CC) $(CFLAGS) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxencow.so.$(MAJOR) $(SHLIB_CFLAGS) -o $@ $^
+       $(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) libxencow.so.$(MAJOR)
+       $(MAKE_LINK) libxencow.so.$(MAJOR) libxencow.so
+
+# Static archive built from the non-PIC objects.
+libxencow.a: $(OBJS)
+       $(AR) rcs $@ $^
+
+# Generate an Emacs tags file for the library sources and local headers.
+.PHONY: TAGS
+TAGS:
+       etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff -r 0477f9061c8a tools/xencow/lib/xc.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xc.c     Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,357 @@
+/******************************************************************************
+ * tools/xencow/lib/xc.c
+ *
+ * libxc refactorisation. This should be put in libxc ultimately.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <xg_private.h>
+#include <xg_save_restore.h>
+#include "xc.h"
+
+
+/*
+ * Returns TRUE if the given machine frame number has a unique mapping
+ * in the guest's pseudophysical map.
+ *
+ * The check is a round trip: the MFN must be below max_mfn, its m2p entry
+ * must be below p2m_size, and the p2m entry for that PFN must lead back to
+ * the same MFN.
+ *
+ * max_mfn and p2m_size are not macro arguments; they (and whatever
+ * mfn_to_pfn()/pfn_to_mfn() reference) must be in scope at the point of use.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                        \
+    (((_mfn) < (max_mfn)) &&                                                  \
+     ((mfn_to_pfn(_mfn) < (p2m_size)) &&                                      \
+      (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
+
+
+/*
+ * Map the machine-to-physical table into this process.
+ *
+ * The list of m2p chunk MFNs is fetched with XENMEM_machphys_mfn_list and
+ * the chunks are then mapped from DOMID_XEN as one range with protection
+ * prot.
+ *
+ * On success the mapping is returned and the MFN of the first chunk is
+ * stored in *m2p_mfn0.  On any failure NULL is returned and *m2p_mfn0 is
+ * left untouched.
+ */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0)
+{
+    struct xen_machphys_mfn_list xmml;
+    privcmd_mmap_entry_t *map_entries = NULL;
+    xen_pfn_t *chunk_mfns;
+    xen_pfn_t *table = NULL;
+    unsigned long table_size = M2P_SIZE(max_mfn);
+    unsigned long nr_chunks  = M2P_CHUNKS(max_mfn);
+    int idx;
+
+    chunk_mfns = calloc(nr_chunks, sizeof(xen_pfn_t));
+    if ( chunk_mfns == NULL )
+    {
+        ERROR("failed to allocate space for m2p mfns");
+        return NULL;
+    }
+
+    /* Ask Xen which MFNs back the m2p table */
+    xmml.max_extents = nr_chunks;
+    set_xen_guest_handle(xmml.extent_start, chunk_mfns);
+
+    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml)
+         || (xmml.nr_extents != nr_chunks) )
+    {
+        ERROR("xc_get_m2p_mfns");
+        goto done;
+    }
+
+    map_entries = calloc(nr_chunks, sizeof(privcmd_mmap_entry_t));
+    if ( map_entries == NULL )
+    {
+        ERROR("failed to allocate space for mmap entries");
+        goto done;
+    }
+
+    for ( idx = 0; idx < nr_chunks; idx++ )
+        map_entries[idx].mfn = chunk_mfns[idx];
+
+    table = xc_map_foreign_ranges(xc_handle, DOMID_XEN, table_size, prot,
+                                  M2P_CHUNK_SIZE, map_entries, nr_chunks);
+    if ( table == NULL )
+        ERROR("xc_mmap_foreign_ranges failed");
+    else
+        *m2p_mfn0 = map_entries[0].mfn;
+
+ done:
+    /* Both scratch arrays are only needed while setting up the mapping */
+    free(map_entries);
+    free(chunk_mfns);
+    return table;
+}
+
+/* During transfer (or in the state file), all page-table pages must be
+ * converted into a 'canonical' form where references to actual mfns
+ * are replaced with references to the corresponding pfns.
+ *
+ * This function performs the appropriate conversion, taking into account
+ * which entries do not require canonicalisation (in particular, those
+ * entries which map the virtual address reserved for the hypervisor).
+ *
+ * spage is the source page table and dpage receives the canonical copy;
+ * entries are 32 bits wide when pt_levels is 2 and 64 bits wide otherwise.
+ * type is compared against the XEN_DOMCTL_PFINFO_L*TAB values to decide
+ * which entries hold hypervisor mappings.  The pfn parameter is not used.
+ * live_p2m_table, live_m2p_table, p2m_size, max_mfn and guest_width are
+ * consumed through the mfn_to_pfn()/pfn_to_mfn()/MFN_IS_IN_PSEUDOPHYS_MAP()
+ * macros.
+ *
+ * Returns 1 if a present entry referenced an MFN that is not in the guest's
+ * pseudophysical map (the entry is rewritten to frame 0), except for the
+ * compat-mode L2 case described below; returns 0 otherwise. */
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table,
+                              unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start,
+                              unsigned int pt_levels,
+                              unsigned int guest_width)
+{
+    uint64_t pte;
+    int pte_last;
+    int xen_start;
+    int xen_end;
+    int i;
+    int race = 0;
+
+    /*
+     * We need to determine which entries in this page table hold
+     * reserved hypervisor mappings. This depends on the current
+     * page table type as well as the number of paging levels.
+     */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+        xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+    /*
+     * In PAE only the L2 mapping the top 1GB contains Xen mappings.
+     * We can spot this by looking for the guest's mapping of the m2p.
+     * Guests must ensure that this check will fail for other L2s.
+     */
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+    {
+        int hstart;
+        uint64_t he;
+
+        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+        he = ((const uint64_t *)spage)[hstart];
+
+        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+        {
+            /* hvirt starts with xen stuff... */
+            xen_start = hstart;
+        }
+        else if ( hvirt_start != 0xf5800000 )
+        {
+            /* old L2s from before hole was shrunk... */
+            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            he = ((const uint64_t *)spage)[hstart];
+            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                xen_start = hstart;
+        }
+    }
+
+    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+        /*
+         * XXX SMH: should compute these from hvirt_start (which we have)
+         * and hvirt_end (which we don't)
+         */
+        xen_start = 256;
+        xen_end   = 272;
+    }
+
+    /* Now iterate through the page table, canonicalising each PTE */
+    for ( i = 0; i < pte_last; i++ )
+    {
+        /* Named so that it does not shadow the (unused) pfn parameter */
+        unsigned long canon_pfn;
+        unsigned long mfn;
+
+        if ( pt_levels == 2 )
+            pte = ((const uint32_t*)spage)[i];
+        else
+            pte = ((const uint64_t*)spage)[i];
+
+        /* Entries in the hypervisor range are emitted as empty */
+        if ( (i >= xen_start) && (i < xen_end) )
+            pte = 0;
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+            {
+                /*
+                 * This will happen if the type info is stale which
+                 * is quite feasible under live migration
+                 */
+                canon_pfn = 0;  /* zap it - we'll retransmit this page later */
+                /*
+                 * XXX: We can't spot Xen mappings in compat-mode L2es
+                 * from 64-bit tools, but the only thing in them is the
+                 * compat m2p, so we quietly zap them.  This doesn't
+                 * count as a race, so don't report it.
+                 */
+                if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
+                       && sizeof(unsigned long) > guest_width) )
+                     race = 1;  /* inform the caller; fatal if !live */
+            }
+            else
+                canon_pfn = mfn_to_pfn(mfn);
+
+            /* Swap the machine address bits for the pseudophysical frame */
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)canon_pfn << PAGE_SHIFT;
+
+            /*
+             * PAE guest L3Es can contain these flags when running on
+             * a 64bit hypervisor. We zap these here to avoid any
+             * surprise at restore time...
+             */
+            if ( (pt_levels == 3)
+                 && (type == XEN_DOMCTL_PFINFO_L3TAB)
+                 && (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
+                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+        }
+
+        if ( pt_levels == 2 )
+            ((uint32_t*)dpage)[i] = pte;
+        else
+            ((uint64_t*)dpage)[i] = pte;
+    }
+
+    return race;
+}
+
+/*
+ * Map a domain's physical-to-machine table read-only.
+ *
+ * The table is reached in three steps: the shared info frame gives the
+ * frame of the p2m "frame list list", whose entries are the frames of the
+ * p2m frame list, whose entries are the frames of the p2m table itself.
+ * The two intermediate levels are copied locally and widened or narrowed
+ * from the guest's word size (guest_width bytes) to the tools' unsigned
+ * long before being used as MFN arrays.
+ *
+ * NOTE(review): p2m_size and guest_width are presumably read by the
+ * P2M_FLL_ENTRIES / P2M_FL_ENTRIES / P2M_*_FL_SIZE macros -- confirm in
+ * xg_save_restore.h.
+ *
+ * Returns the mapping from xc_map_foreign_batch(), or NULL on failure.  All
+ * intermediate mappings and copies are released before returning.
+ */
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width)
+{
+    xc_dominfo_t info;
+    shared_info_t *live_shared_info = NULL;
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
+    xen_pfn_t *live_p2m_table = NULL;
+    int i;
+
+    /* Map the shared info frame */
+    if ( xc_domain_getinfo(xc_handle, domain_id, 1, &info) != 1 )
+    {
+        ERROR("could not get domain info");
+        goto out;
+    }
+
+    live_shared_info = xc_map_foreign_range(xc_handle, domain_id,
+                                            PAGE_SIZE, PROT_READ,
+                                            info.shared_info_frame);
+    if ( live_shared_info == NULL )
+    {
+        ERROR("could not map live shared info");
+        goto out;
+    }
+
+    /* Get the p2m frame list list */
+    live_p2m_frame_list_list =
+        xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE, PROT_READ,
+                             live_shared_info->arch.pfn_to_mfn_frame_list_list);
+    if ( live_p2m_frame_list_list == NULL )
+    {
+        ERROR("could not map live p2m frame list list");
+        goto out;
+    }
+
+    /* Get a local copy of the live_P2M_frame_list_list */
+    p2m_frame_list_list = malloc(PAGE_SIZE);
+    if ( !p2m_frame_list_list )
+    {
+        ERROR("could not allocate p2m_frame_list_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
+
+    /*
+     * Canonicalise guest's unsigned long vs ours.  Narrowing walks forwards
+     * and widening walks backwards so the in-place conversion never
+     * overwrites an entry it has not read yet.
+     */
+    if ( guest_width > sizeof(unsigned long) )
+        for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
+            if ( i < PAGE_SIZE/guest_width )
+                p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
+            else
+                p2m_frame_list_list[i] = 0;
+    else if ( guest_width < sizeof(unsigned long) )
+        for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
+            p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
+
+    /* Get the p2m frame list */
+    live_p2m_frame_list = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ,
+                                               p2m_frame_list_list,
+                                               P2M_FLL_ENTRIES);
+    if ( live_p2m_frame_list == NULL )
+    {
+        ERROR("could not map live p2m frame list");
+        goto out;
+    }
+
+    /* Get a local copy of the live p2m frame_list */
+    p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE);
+    if ( !p2m_frame_list )
+    {
+        ERROR("could not allocate p2m frame list array");
+        goto out;
+    }
+    memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
+
+    /* Canonicalise guest's unsigned long vs ours */
+    if ( guest_width > sizeof(unsigned long) )
+        for ( i = 0; i < P2M_FL_ENTRIES; i++ )
+            p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
+    else if ( guest_width < sizeof(unsigned long) )
+        for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
+            p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
+
+    /* Get the p2m table */
+    live_p2m_table = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ,
+                                          p2m_frame_list,
+                                          P2M_FL_ENTRIES);
+    if ( live_p2m_table == NULL )
+    {
+        ERROR("could not map live p2m table");
+        goto out;
+    }
+
+ out:
+    /* Success also falls through here: only the final table stays mapped */
+    if ( live_p2m_frame_list )
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+    if ( live_p2m_frame_list_list )
+        munmap(live_p2m_frame_list_list, PAGE_SIZE);
+
+    if ( live_shared_info )
+        munmap(live_shared_info, PAGE_SIZE);
+
+    /* free(NULL) is a no-op, so no guards are needed */
+    free(p2m_frame_list);
+    free(p2m_frame_list_list);
+
+    return live_p2m_table;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xc.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xc.h     Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * tools/xencow/lib/xc.h
+ *
+ * libxc refactorisation. This should be put in libxc ultimately.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/*
+ * Machine frame -> pseudophysical frame, by indexing live_m2p_table.
+ * The table is not a macro argument: a variable named live_m2p_table must
+ * be in scope wherever this macro is used.
+ */
+#define mfn_to_pfn(_mfn)  (live_m2p_table[(_mfn)])
+
+/*
+ * Pseudophysical frame -> machine frame, by indexing live_p2m_table.
+ * Entries are read as 64-bit when guest_width is 8 and as 32-bit otherwise;
+ * a 32-bit entry of 0xffffffff is returned as -1UL.  live_p2m_table and
+ * guest_width must be in scope wherever this macro is used, and _pfn is
+ * evaluated more than once.
+ */
+#define pfn_to_mfn(_pfn)                                                      \
+  ((xen_pfn_t) (((guest_width)==8)                                            \
+                ? (((uint64_t *)live_p2m_table)[(_pfn)])                      \
+                : ((((uint32_t *)live_p2m_table)[(_pfn)]) == 0xffffffffU      \
+                   ? (-1UL) : (((uint32_t *)live_p2m_table)[(_pfn)]))))
+
+
+/*
+ * Disabled sketch of a per-domain context struct bundling the values that
+ * the functions below currently take as separate parameters.
+ */
+#if 0
+typedef struct xc_domain_st {
+    domid_t        domain_id;
+    xen_pfn_t     *live_p2m_table;
+    xen_pfn_t     *live_m2p_table;
+    unsigned long  m2p_mfn0;
+    unsigned long  p2m_size;
+    unsigned long  max_mfn;
+    unsigned long  hvirt_start;
+    unsigned int   pt_levels;
+    unsigned int   guest_width;
+} xc_domain_t;
+#endif
+
+
+/* Map the m2p table; see xc.c. */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0);
+
+/* Rewrite a page-table page from MFNs to PFNs; see xc.c. */
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table,
+                              unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start,
+                              unsigned int pt_levels,
+                              unsigned int guest_width);
+
+/* Map a domain's p2m table read-only; see xc.c. */
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width);
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,1072 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow.c
+ *
+ * VM memory Copy-on-Write library.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+
+#include <pthread.h>
+#include <signal.h>
+
+#include <xen/domctl.h>
+
+#include <xc_private.h>
+#include <xg_save_restore.h>
+
+#include "xc.h"
+#include "xencow.h"
+
+
+/*
+ * Create filename if it does not exist, or truncate it if it does, then
+ * close it again straight away.  The requested permissions are read/write
+ * for user, group and other.
+ *
+ * Returns 0 on success, or -EIO if the file could not be opened.
+ */
+static int xencow_create_file(const char *filename)
+{
+    int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR,
+                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+
+    if ( fd >= 0 )
+    {
+        close(fd);
+        return 0;
+    }
+
+    ERROR("Error opening file %s", filename);
+    return -EIO;
+}
+
+/*
+ * Send an ioctl to the xencow device.
+ *
+ * /dev/xencow0 is opened for each call and closed again afterwards.
+ * Returns -EIO if the device cannot be opened, otherwise whatever ioctl()
+ * returned (0 on success).
+ */
+static int xencow_send_ioctl(int cmd, unsigned long arg)
+{
+    int dev;
+    int rc;
+
+    dev = open("/dev/xencow0", O_RDWR);
+    if ( dev < 0 )
+    {
+        ERROR("Failed to open xencow device (/dev/xencow0)");
+        return -EIO;
+    }
+
+    rc = ioctl(dev, cmd, arg);
+    if ( rc != 0 )
+        ERROR("Error during ioctl of xencow device");
+
+    close(dev);
+    return rc;
+}
+
+/*
+ * Release the memory hanging off a CoW struct: the locked ring/page buffer,
+ * the MFN array and the p2m table pointer.  The struct itself is not freed.
+ *
+ * NOTE(review): live_p2m_table is released with free().  If it holds the
+ * mapping returned by xc_get_live_p2m_table() it would need munmap()
+ * instead -- confirm where the field is assigned.
+ */
+static void xencow_free(xencow_t *cow)
+{
+    /* Unlock before handing the buffer back to the allocator */
+    munlock(cow->buffer, BUFFER_SIZE);
+    free(cow->buffer);
+
+    free(cow->mfns);
+
+    /* free(NULL) is a no-op, so an unset table needs no special case */
+    free(cow->live_p2m_table);
+}
+
+/*
+ * Make sure *bitmap points at a zero-filled bitmap with room for
+ * bitmap_size bits.
+ *
+ * If *bitmap is NULL a new bitmap is allocated; otherwise the existing one
+ * is reused (it is assumed to have been sized for bitmap_size bits by an
+ * earlier call).  Either way the bitmap is cleared.
+ *
+ * The word count is rounded up, so a bitmap_size that is not a multiple of
+ * BITS_PER_LONG still gets its final, partially used word.  Rounding down
+ * while clearing bitmap_size / 8 bytes would write past the allocation.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int xencow_alloc_bitmap(unsigned long **bitmap,
+                               unsigned long bitmap_size)
+{
+    unsigned long nr_longs =
+        (bitmap_size + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+    if ( *bitmap == NULL )
+    {
+        *bitmap = calloc(nr_longs, sizeof(unsigned long));
+        if ( *bitmap == NULL )
+            return -ENOMEM;
+    }
+
+    /* Clear exactly the words that were allocated */
+    memset(*bitmap, 0, nr_longs * sizeof(unsigned long));
+
+    return 0;
+}
+
+/*
+ * Event loop run in its own thread (see xencow_start_thread).  It never
+ * returns.
+ *
+ * Each iteration waits for an event with a timeout argument of 10 (units
+ * are defined by xencow_wait_for_event_or_timeout) and then:
+ *   - buffer port, or -1 (presumably a timeout -- confirm): flush the buffer;
+ *   - pause port: flush the buffer, then resume the domain;
+ *   - anything else: log an error.
+ */
+static void *xencow_handle_events(void *c)
+{
+    xencow_t *cow = c;
+
+    IPRINTF("Starting resume thread\n");
+
+    for ( ; ; )
+    {
+        int port = xencow_wait_for_event_or_timeout(cow, 10);
+
+        if ( port == cow->buffer_port )
+        {
+            DPRINTF("Got buffer event\n");
+            xencow_flush_buffer(cow);
+            continue;
+        }
+
+        if ( port == -1 )
+        {
+            xencow_flush_buffer(cow);
+            continue;
+        }
+
+        if ( port == cow->pause_port )
+        {
+            /* Pause event: flush what is pending, then let the domain run */
+            DPRINTF("Got pause event\n");
+
+            xencow_flush_buffer(cow);
+
+            if ( xencow_resume(cow) != 0 )
+                ERROR("Failed to resume domain");
+            continue;
+        }
+
+        ERROR("Unknown event");
+    }
+}
+
+/*
+ * Start a detached thread running start_routine(cow).
+ *
+ * SIGTERM, SIGINT, SIGHUP and SIGQUIT are blocked around pthread_create()
+ * so that the new thread inherits a mask with them blocked.  The caller's
+ * own mask is restored afterwards, whether or not the thread was created.
+ *
+ * Returns 0 on success or -EIO if the thread could not be created.
+ */
+static int xencow_start_thread(xencow_t *cow, void *(*start_routine)(void *))
+{
+    pthread_t thread;
+    sigset_t oldset;
+    sigset_t newset;
+    int ret;
+
+    sigemptyset(&newset);
+    sigaddset(&newset, SIGTERM);
+    sigaddset(&newset, SIGINT);
+    sigaddset(&newset, SIGHUP);
+    sigaddset(&newset, SIGQUIT);
+    pthread_sigmask(SIG_BLOCK, &newset, &oldset);
+
+    ret = pthread_create(&thread, NULL, start_routine, cow);
+
+    /* Must also happen on failure, or the caller is left with the
+     * signals blocked */
+    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+
+    if ( ret != 0 )
+    {
+        ERROR("Failed to create thread");
+        return -EIO;
+    }
+
+    pthread_detach(thread);
+
+    return 0;
+}
+
+/*
+ * Allocate and register the shared buffer used by the CoW mechanism.
+ *
+ * A page-aligned, mlock()ed buffer of BUFFER_SIZE bytes is allocated; its
+ * first XEN_COW_RING_SIZE bytes hold the shared ring and the rest is the
+ * page buffer.  The buffer is handed to the xencow device with
+ * XEN_COW_IOCTL_INIT, which fills in the MFNs backing it.  The page-buffer
+ * MFNs are kept in cow->mfns and one request per MFN is pushed on the ring.
+ *
+ * Returns 0 on success and a negative value on failure.  On failure
+ * everything allocated here has been released and cow->buffer,
+ * cow->page_buffer and cow->mfns are NULL, so a later xencow_free() on the
+ * same struct cannot free the buffer a second time.
+ */
+static int xencow_init_buffer(xencow_t *cow)
+{
+    void *buffer;
+    cow_init_t *cow_init;
+    cow_request_t req;
+    RING_IDX req_prod;
+    int num_pages;
+    int i;
+    int ret;
+
+    DPRINTF("buffer size: %ld\n", BUFFER_SIZE);
+
+    /* Allocate page aligned buffer */
+    ret = posix_memalign(&buffer, PAGE_SIZE, BUFFER_SIZE);
+    if ( ret != 0 )
+    {
+        /* posix_memalign() reports failure as a positive errno value */
+        ret = -ret;
+        goto out_alloc;
+    }
+
+    /* Lock buffer in memory so it can't be paged out */
+    if ( mlock(buffer, BUFFER_SIZE) != 0 )
+    {
+        ret = -errno;
+        goto out_lock;
+    }
+
+    cow->buffer = buffer;
+    cow->page_buffer = (void *)((char *)buffer + XEN_COW_RING_SIZE);
+
+    /* Initialise ring */
+    SHARED_RING_INIT((cow_sring_t *)cow->buffer);
+    FRONT_RING_INIT(&cow->front_ring, (cow_sring_t *)cow->buffer,
+                    XEN_COW_RING_SIZE);
+
+    num_pages = XEN_COW_RING_PAGES + RING_SIZE(&cow->front_ring);
+
+    DPRINTF("number of ring entries: %u\n", RING_SIZE(&cow->front_ring));
+
+    /* Allocate memory for ioctl struct */
+    ret = -ENOMEM;
+    cow_init = malloc(sizeof(cow_init_t) +
+                      (sizeof(unsigned long) * num_pages));
+    if ( cow_init == NULL )
+        goto out_init;
+
+    /* Initialise ioctl struct */
+    cow_init->addr = (unsigned long)(cow->buffer);
+    cow_init->num_mfns = num_pages;
+
+    /* Get MFNs */
+    ret = xencow_send_ioctl(XEN_COW_IOCTL_INIT, (unsigned long)cow_init);
+    if ( ret != 0 )
+        goto out;
+
+    /* Allocate memory for the page buffer MFNs */
+    cow->num_mfns = RING_SIZE(&cow->front_ring);
+    cow->mfns = calloc(cow->num_mfns, sizeof(unsigned long));
+    if ( cow->mfns == NULL )
+    {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* Copy MFNs: the ring pages come first, the page buffer after them */
+    cow->sring_mfn = cow_init->mfns[0];
+    memcpy(cow->mfns, &cow_init->mfns[XEN_COW_RING_PAGES],
+           sizeof(unsigned long) * cow->num_mfns);
+
+    /* Fill ring with page buffer MFNs */
+    memset(&req, 0, sizeof(req));   /* no stray stack bytes in the ring */
+    req_prod = cow->front_ring.req_prod_pvt;
+    for ( i = 0; i < cow->num_mfns; i++ )
+    {
+        req.mfn = cow->mfns[i];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod + i), &req,
+               sizeof(cow_request_t));
+    }
+
+    cow->front_ring.req_prod_pvt = req_prod + i;
+    RING_PUSH_REQUESTS(&cow->front_ring);
+
+    free(cow_init);
+    return 0;
+
+ out:
+    free(cow_init);
+ out_init:
+    /* Do not leave pointers to the buffer we are about to free */
+    cow->buffer = NULL;
+    cow->page_buffer = NULL;
+    munlock(buffer, BUFFER_SIZE);
+ out_lock:
+    free(buffer);
+ out_alloc:
+    return ret;
+}
+
+/*
+ * Open the control and event-channel interfaces and bind the two CoW VIRQs
+ * (VIRQ_COW_BUFFER and VIRQ_COW_PAUSE).
+ *
+ * Returns 0 on success or -EINVAL on failure.  On failure any handle opened
+ * here is closed again and reset to -1, so nothing is leaked and a caller
+ * cleaning up afterwards will not close it twice.
+ */
+static int xencow_init_xen(xencow_t *cow)
+{
+    /* Open connection to Xen */
+    cow->xc_handle = xc_interface_open();
+    if ( cow->xc_handle < 0 )
+    {
+        ERROR("Failed to connect to Xen");
+        goto err;
+    }
+
+    /* Open event channel */
+    cow->xce_handle = xc_evtchn_open();
+    if ( cow->xce_handle < 0 )
+    {
+        ERROR("Failed to open event channel");
+        goto err;
+    }
+
+    /* Bind VIRQ ports for event notification */
+    cow->buffer_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_BUFFER);
+    if ( cow->buffer_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err;
+    }
+
+    cow->pause_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_PAUSE);
+    if ( cow->pause_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err;
+    }
+
+    return 0;
+
+ err:
+    if ( cow->xce_handle >= 0 )
+    {
+        xc_evtchn_close(cow->xce_handle);
+        cow->xce_handle = -1;
+    }
+
+    if ( cow->xc_handle >= 0 )
+    {
+        xc_interface_close(cow->xc_handle);
+        cow->xc_handle = -1;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Fill in the per-domain fields of the CoW struct: whether the domain is
+ * HVM, the size of its p2m (maximum guest PFN + 1) and the platform
+ * parameters (max_mfn, hvirt_start, pt_levels, guest_width).
+ *
+ * Returns 0 on success or -EINVAL if any of the queries fails.
+ */
+static int xencow_init_domain_info(xencow_t *cow)
+{
+    xc_dominfo_t info;
+    int max_gpfn;
+    int rc;
+
+    /* Get HVM info */
+    rc = xc_domain_getinfo(cow->xc_handle, cow->domain_id, 1, &info);
+    if ( rc != 1 )
+    {
+        ERROR("Failed to get domain info");
+        goto err;
+    }
+    cow->is_hvm = info.hvm;
+
+    /* Get memory size; a negative result is an error, not a frame number */
+    max_gpfn = xc_memory_op(cow->xc_handle, XENMEM_maximum_gpfn,
+                            &cow->domain_id);
+    if ( max_gpfn < 0 )
+    {
+        ERROR("Failed to get maximum gpfn");
+        goto err;
+    }
+    cow->p2m_size = max_gpfn + 1;
+
+    /* Get platform info */
+    rc = get_platform_info(cow->xc_handle, cow->domain_id,
+                           &cow->platform_info.max_mfn,
+                           &cow->platform_info.hvirt_start,
+                           &cow->platform_info.pt_levels,
+                           &cow->platform_info.guest_width);
+    if ( rc != 1 )
+    {
+        ERROR("Failed to get platform info");
+        goto err;
+    }
+
+    return 0;
+
+ err:
+    return -EINVAL;
+}
+
+/*
+ * Create and initialise a CoW context for domain domid.
+ *
+ * Sets up the shared buffer, the connection to Xen and the domain info,
+ * then starts the event handler thread.
+ *
+ * Returns the new context, or NULL on failure (errno is set to ENOMEM when
+ * the context itself could not be allocated).  On failure the context, its
+ * buffers and any Xen handles that are still open are released.
+ */
+xencow_t *xencow_init(domid_t domid)
+{
+    xencow_t *cow;
+    int rc;
+
+    /* Allocate a zeroed CoW struct */
+    cow = calloc(1, sizeof(xencow_t));
+    if ( cow == NULL )
+    {
+        errno = ENOMEM;
+        return NULL;
+    }
+
+    cow->domain_id = domid;
+    cow->xc_handle = -1;
+    cow->xce_handle = -1;
+
+    INIT_LIST_HEAD(&cow->snapshots);
+
+    /* Initialise locks */
+    cow_ring_lock_init(cow);
+    cow_snapshots_lock_init(cow);
+
+    /* Initialise buffer */
+    IPRINTF("Initialising buffer\n");
+    rc = xencow_init_buffer(cow);
+    if ( rc != 0 )
+    {
+        /*
+         * xencow_init_buffer() releases its own allocations on failure, so
+         * xencow_free() must not run here: it would free the buffer again.
+         */
+        ERROR("Failed to initialise buffer");
+        goto out_cow;
+    }
+
+    /* Initialise connection to Xen */
+    rc = xencow_init_xen(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to initialise connection to Xen");
+        goto out;
+    }
+
+    /* Get domain info */
+    rc = xencow_init_domain_info(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to get domain info");
+        goto out;
+    }
+
+    /* Start event handler thread */
+    rc = xencow_start_thread(cow, xencow_handle_events);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to start event handler thread");
+        goto out;
+    }
+
+    return cow;
+
+ out:
+    /* Handles stay at -1 unless they were opened successfully */
+    if ( cow->xce_handle >= 0 )
+        xc_evtchn_close(cow->xce_handle);
+    if ( cow->xc_handle >= 0 )
+        xc_interface_close(cow->xc_handle);
+    xencow_free(cow);
+ out_cow:
+    free(cow);
+    return NULL;
+}
+
+/*
+ * Open, read-only, the snapshot file that holds pfn: the state file when
+ * state_pfn(pfn) is true, the backing file otherwise.
+ *
+ * Returns the descriptor from open(), i.e. a negative value with errno set
+ * on failure.
+ */
+static int xencow_open_snapshot_file_for_reading(xencow_snapshot_t *snapshot,
+                                                 unsigned long pfn)
+{
+    const char *path = state_pfn(pfn) ? snapshot->state_file
+                                      : snapshot->backing_file;
+
+    /* The mode argument is only consulted by open() when creating a file */
+    return open(path, O_RDONLY, S_IRUSR | S_IRGRP | S_IROTH);
+}
+
+/*
+ * Pick the snapshot a page captured at ring index now belongs to and open
+ * the matching file read/write.
+ *
+ * The snapshot list is walked from the tail and the first entry whose
+ * when is <= now is selected and returned through *snapshot.  The state
+ * file is opened when state_pfn(pfn) is true, the backing file otherwise.
+ *
+ * NOTE(review): if the list is empty or no entry satisfies the test, the
+ * loop ends with *snapshot not pointing at a real entry and it is still
+ * dereferenced below -- confirm callers guarantee a match.
+ *
+ * Returns the descriptor from open(), negative on failure.
+ */
+static int xencow_open_snapshot_file_for_writing(xencow_t *cow,
+                                                 xencow_snapshot_t **snapshot,
+                                                 RING_IDX now,
+                                                 unsigned long pfn)
+{
+    const char *path;
+
+    /* Get the right snapshot to copy this page to */
+    list_for_each_entry_reverse ( (*snapshot), &cow->snapshots, list )
+    {
+        if ( (*snapshot)->when <= now )
+            break;
+    }
+
+    path = state_pfn(pfn) ? (*snapshot)->state_file
+                          : (*snapshot)->backing_file;
+
+    /* The mode argument is only consulted by open() when creating a file */
+    return open(path, O_RDWR,
+                S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR | S_IWGRP | S_IWOTH);
+}
+
+/*
+ * Read the page for pfn from fd into buffer_page.
+ *
+ * The file position is set to pfn_offset(pfn) and exactly PAGE_SIZE bytes
+ * are read, looping over short reads and retrying reads interrupted by a
+ * signal.
+ *
+ * Returns 0 on success.  On failure returns a negative errno value; hitting
+ * end-of-file before a full page has been read is reported as -EIO (read()
+ * does not set errno in that case, so -errno could otherwise be 0 and look
+ * like success).
+ */
+static int xencow_read_page(int fd, unsigned long pfn, void *buffer_page)
+{
+    char *dst = buffer_page;
+    off_t offset;
+    int total_read;
+    int ret;
+
+    offset = pfn_offset(pfn);
+
+    if ( lseek64(fd, offset, SEEK_SET) < 0 )
+    {
+        /* Capture errno before logging can disturb it */
+        ret = -errno;
+        ERROR("Error seeking: %ld", (long)offset);
+        return ret;
+    }
+
+    total_read = 0;
+    while ( total_read < PAGE_SIZE )
+    {
+        int bytes_read = read(fd, dst + total_read, PAGE_SIZE - total_read);
+
+        if ( bytes_read < 0 )
+        {
+            if ( errno == EINTR )
+                continue;
+            ret = -errno;
+            goto err;
+        }
+
+        if ( bytes_read == 0 )
+        {
+            /* Unexpected end of file: the page is not (fully) there */
+            ret = -EIO;
+            goto err;
+        }
+
+        total_read += bytes_read;
+    }
+
+    return 0;
+
+ err:
+    ERROR("Read error");
+    return ret;
+}
+
+/*
+ * Copy the current contents of guest page pfn into buffer_page.
+ *
+ * The PFN is translated with xencow_p2m().  If the frame is not mapped
+ * (is_mapped() is false), cannot be mapped, or is flagged
+ * XEN_DOMCTL_PFINFO_XTAB, buffer_page is filled with zeroes instead.
+ *
+ * For HVM domains the XTAB check uses the value left in the MFN slot by
+ * xc_map_foreign_batch(); for PV domains the frame type is fetched with
+ * xc_get_pfn_type_batch().
+ *
+ * Returns 0 on success, or the non-zero result of xc_get_pfn_type_batch()
+ * if that call fails.  The temporary mapping is released on every path.
+ */
+static int xencow_read_live_page(xencow_t *cow, unsigned long pfn,
+                                 void *buffer_page)
+{
+    unsigned long mfn;
+    void *page;
+    int copy_frame = 0;
+    int ret = 0;
+
+    /* Get MFN */
+    mfn = xencow_p2m(cow, pfn);
+
+    /* Check if MFN is mapped */
+    if ( !is_mapped(mfn) )
+    {
+        /* Copy blank page */
+        memset(buffer_page, 0, PAGE_SIZE);
+        return 0;
+    }
+
+    page = xc_map_foreign_batch(cow->xc_handle, cow->domain_id, PROT_READ,
+                                &mfn, 1);
+
+    if ( cow->is_hvm )
+    {
+        if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB )
+            copy_frame = 1;
+    }
+    else
+    {
+        /*
+         * xc_get_pfn_type_batch() works on 32-bit entries, so pass a
+         * properly typed temporary holding the low 32 bits of the MFN
+         * rather than aliasing the unsigned long.
+         */
+        uint32_t pfn_type = (uint32_t)mfn;
+
+        ret = xc_get_pfn_type_batch(cow->xc_handle, cow->domain_id, 1,
+                                    &pfn_type);
+        if ( ret != 0 )
+        {
+            ERROR("get_pfn_type_batch failed");
+            goto out;
+        }
+
+        if ( (pfn_type & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+             XEN_DOMCTL_PFINFO_XTAB )
+            copy_frame = 1;
+    }
+
+    if ( copy_frame && page != NULL )
+        /* Copy live page */
+        memcpy(buffer_page, page, PAGE_SIZE);
+    else
+        /* Copy blank page */
+        memset(buffer_page, 0, PAGE_SIZE);
+
+ out:
+    if ( page != NULL )
+        munmap(page, PAGE_SIZE);
+
+    return ret;
+}
+
+/*
+ * Read page pfn from one snapshot into buffer_page.
+ *
+ * Returns -ENOENT if the snapshot's bitmap does not have pfn set, -errno if
+ * the snapshot file cannot be opened, and otherwise the result of
+ * xencow_read_page() (0 on success).
+ */
+static int xencow_read_snapshot_page(xencow_snapshot_t *snapshot,
+                                     unsigned long pfn,
+                                     void *buffer_page)
+{
+    int fd;
+    int ret;
+
+    /* Nothing stored for this page in this snapshot */
+    if ( !test_bit(pfn, snapshot->bitmap) )
+        return -ENOENT;
+
+    /* Open file for reading */
+    fd = xencow_open_snapshot_file_for_reading(snapshot, pfn);
+    if ( fd < 0 )
+    {
+        ERROR("Error opening file");
+        return -errno;
+    }
+
+    /* Read file and close */
+    ret = xencow_read_page(fd, pfn, buffer_page);
+    close(fd);
+
+    return ret;
+}
+
+/*
+ * Read num_pages pages, starting at start_pfn, as seen by snapshot number
+ * snapshot_num (0 is the first entry of cow->snapshots).
+ *
+ * For each page the lookup order is:
+ *   1. the requested snapshot's own file,
+ *   2. every later snapshot, oldest first,
+ *   3. the live domain; the ring is then flushed and the newest snapshot
+ *      re-checked in case the page was dirtied while it was being copied.
+ *
+ * @buffer must hold num_pages * PAGE_SIZE bytes.
+ *
+ * Returns 0 on success, -ENOENT if snapshot_num does not name an existing
+ * snapshot, or a negative error from opening/reading the snapshot file or
+ * reading the live page.
+ */
+int xencow_read_buffer(xencow_t *cow, int snapshot_num,
+                       unsigned long start_pfn, int num_pages, void *buffer)
+{
+    xencow_snapshot_t *snapshot;
+    int current_num;
+    int fd;
+    int i;
+    int ret;
+
+    cow_snapshots_lock(cow);
+
+    current_num = 0;
+    list_for_each_entry ( snapshot, &cow->snapshots, list )
+    {
+        if ( current_num == snapshot_num )
+            break;
+        current_num++;
+    }
+
+    /*
+     * If the loop ran off the end, the cursor aliases the list head rather
+     * than a real snapshot and must not be dereferenced.
+     */
+    if ( &snapshot->list == &cow->snapshots )
+    {
+        ret = -ENOENT;
+        goto out_open;
+    }
+
+    /* Open file */
+    fd = xencow_open_snapshot_file_for_reading(snapshot, start_pfn);
+    if ( fd < 0 )
+    {
+        ret = errno ? -errno : -EIO;
+        goto out_open;
+    }
+
+    /* Read pages */
+    for ( i = 0; i < num_pages; i++ )
+    {
+        void *buffer_page = (char *)buffer + ((size_t)i * PAGE_SIZE);
+        unsigned long pfn = start_pfn + i;
+        /*
+         * Separate cursor for the "later snapshots" walk: 'snapshot' has to
+         * keep naming the requested snapshot for the remaining pages.
+         */
+        xencow_snapshot_t *later = snapshot;
+        xencow_snapshot_t *latest;
+        int found = 0;
+
+        /* Check bitmap for page */
+        if ( test_bit(pfn, snapshot->bitmap) )
+        {
+            ret = xencow_read_page(fd, pfn, buffer_page);
+            if ( ret != 0 )
+                goto out;
+            continue;
+        }
+
+        /* Check later snapshots */
+        list_for_each_entry_continue ( later, &cow->snapshots, list )
+        {
+            if ( xencow_read_snapshot_page(later, pfn, buffer_page) == 0 )
+            {
+                found = 1;
+                break;
+            }
+        }
+
+        if ( found )
+            continue;
+
+        /* If not found, read page from live domain */
+        ret = xencow_read_live_page(cow, pfn, buffer_page);
+        if ( ret != 0 )
+            goto out;
+
+        /*
+         * Flush buffer.  xencow_flush_buffer() takes the snapshots lock
+         * itself, so it has to be dropped around the call.
+         * NOTE(review): this relies on snapshots never being removed from
+         * the list while the lock is released -- confirm.
+         */
+        cow_snapshots_unlock(cow);
+        xencow_flush_buffer(cow);
+        cow_snapshots_lock(cow);
+
+        /* Check (latest) bitmap for page again */
+        latest = list_bottom(&cow->snapshots, xencow_snapshot_t, list);
+        if ( latest != NULL &&
+             xencow_read_snapshot_page(latest, pfn, buffer_page) == 0 )
+            DPRINTF("Page dirtied since read from live\n");
+    }
+
+    ret = 0;
+
+ out:
+    close(fd);
+ out_open:
+    cow_snapshots_unlock(cow);
+    return ret;
+}
+
+/*
+ * Write exactly one page to fd at the given byte offset.
+ *
+ * @fd:     file descriptor open for writing
+ * @offset: absolute byte offset to seek to before writing
+ * @page:   source buffer of PAGE_SIZE bytes
+ *
+ * Short writes are continued until the whole page is out; writes
+ * interrupted by a signal are retried.
+ *
+ * Returns 0 on success or a negative errno value (-EIO if write() makes no
+ * progress without reporting an error).
+ */
+static int xencow_write_page(int fd, off_t offset, void *page)
+{
+    size_t total_written = 0;
+
+    /* A failed seek would silently put the page at the wrong offset */
+    if ( lseek64(fd, offset, SEEK_SET) < 0 )
+    {
+        int saved_errno = errno;
+
+        ERROR("Error seeking: %ld (%lx)\n", (long)offset,
+              (unsigned long)offset_pfn(offset));
+        return saved_errno ? -saved_errno : -EIO;
+    }
+
+    /* Logged once, from the page start, so the read stays inside the page */
+    DPRINTF("Writing first chunk: %lx\n", *((unsigned long *)page));
+
+    /* Write page */
+    while ( total_written < PAGE_SIZE )
+    {
+        ssize_t bytes_written = write(fd, (char *)page + total_written,
+                                      PAGE_SIZE - total_written);
+
+        if ( bytes_written < 0 )
+        {
+            int saved_errno = errno;
+
+            if ( saved_errno == EINTR )
+                continue;
+
+            ERROR("Error writing");
+            return -saved_errno;
+        }
+
+        if ( bytes_written == 0 )
+        {
+            /* No progress and no errno: do not report this as success */
+            ERROR("Error writing");
+            return -EIO;
+        }
+
+        total_written += (size_t)bytes_written;
+
+        DPRINTF("Wrote %d bytes\n", (int)bytes_written);
+    }
+
+    return 0;
+}
+
+/*
+ * Write one ring page into the snapshot file open on fd.
+ *
+ * A state page (pfn == STATE_MFN) is placed at the slot given by its ring
+ * index relative to snapshot->when.  A normal page is placed at its own PFN
+ * slot, but only if its bitmap bit was not already set; otherwise nothing
+ * is written and -1 is returned.
+ *
+ * Returns 0 on success, -1 if the page was already recorded, or the error
+ * from xencow_write_page().  For a normal page the bitmap bit is cleared
+ * again when the write fails.
+ */
+static int xencow_flush_page(xencow_t *cow, xencow_snapshot_t *snapshot,
+                             RING_IDX now, int fd, unsigned long pfn,
+                             void *page)
+{
+    const int is_state = state_pfn(pfn);
+    off_t where;
+    int rc;
+
+    if ( is_state )
+        where = pfn_offset(now - snapshot->when);
+    else
+    {
+        /* Already saved for this snapshot */
+        if ( test_and_set_bit(pfn, snapshot->bitmap) )
+            return -1;
+
+        where = pfn_offset(pfn);
+    }
+
+    /* Write to file */
+    rc = xencow_write_page(fd, where, page);
+
+    /* Undo the bitmap claim if the page never reached the file */
+    if ( rc != 0 && !is_state )
+        clear_bit(pfn, snapshot->bitmap);
+
+    return rc;
+}
+
+/*
+ * When non-zero, the ring indices are published once after the whole batch
+ * has been processed; when zero, after every individual page.
+ */
+#define BATCH_REQS 1
+
+/*
+ * Drain the response ring: write every buffered page to the appropriate
+ * snapshot file and hand the buffer page's MFN back as a new request.
+ *
+ * Does nothing if no snapshot has been taken yet.  Takes the ring lock for
+ * the whole drain and the snapshots lock while selecting the output file.
+ * If a snapshot file cannot be opened the drain stops early; responses
+ * handled up to that point are still consumed and their MFNs recycled.
+ */
+void xencow_flush_buffer(xencow_t *cow)
+{
+    cow_request_t req;
+    RING_IDX req_prod;
+    RING_IDX rsp_prod;
+    RING_IDX i;
+    xencow_snapshot_t *snapshot = NULL;
+    int fd = -1;
+    int fd_is_state = 0;
+
+    if ( cow->next_snapshot == 0 )
+        return;
+
+    cow_ring_lock(cow);
+
+    rsp_prod = cow->front_ring.sring->rsp_prod;
+    req_prod = cow->front_ring.sring->req_prod;
+
+    /* Flush buffer pages */
+    for ( i = cow->front_ring.rsp_cons; i != rsp_prod; i++ )
+    {
+        cow_response_t rsp;
+        void *page = (char *)cow->page_buffer
+                     + (RING_MASK(&cow->front_ring, i) << PAGE_SHIFT);
+
+        memcpy(&rsp, RING_GET_RESPONSE(&cow->front_ring, i),
+               sizeof(cow_response_t));
+
+        DPRINTF("num: %lx; pfn: %lx; page first chunk: %lx\n",
+                (unsigned long)i, rsp.pfn, *((unsigned long *)page));
+
+        /* Open appropriate file */
+        cow_snapshots_lock(cow);
+
+        if ( (fd < 0) ||
+             (state_pfn(rsp.pfn) && !fd_is_state) ||
+             (!state_pfn(rsp.pfn) && fd_is_state) )
+        {
+            if ( fd >= 0 )
+                close(fd);
+
+            fd = xencow_open_snapshot_file_for_writing(cow, &snapshot, i,
+                                                       rsp.pfn);
+            if ( fd < 0 )
+            {
+                ERROR("Error opening file");
+                /*
+                 * Leave through the common exit path so that neither the
+                 * snapshots lock nor the ring lock stays held.
+                 */
+                cow_snapshots_unlock(cow);
+                break;
+            }
+
+            fd_is_state = state_pfn(rsp.pfn);
+            if ( fd_is_state )
+                DPRINTF("state page: %u\n", (unsigned int)i);
+            else
+                DPRINTF("normal page: %u\n", (unsigned int)i);
+        }
+
+        cow_snapshots_unlock(cow);
+
+        /*
+         * Flush buffer page.  The result is deliberately not checked here;
+         * xencow_flush_page() returns -1 for a page that is already saved.
+         */
+        xencow_flush_page(cow, snapshot, i, fd, rsp.pfn, page);
+
+#if !BATCH_REQS
+        cow->front_ring.rsp_cons = i + 1;
+        cow->front_ring.sring->rsp_event = i + 2;
+#endif
+
+        /* Put buffer page MFN in ring */
+        req.mfn = cow->mfns[RING_MASK(&cow->front_ring, i)];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod), &req,
+               sizeof(cow_request_t));
+        req_prod++;
+
+#if !BATCH_REQS
+        /* Push added MFN out */
+        cow->front_ring.req_prod_pvt = req_prod;
+        RING_PUSH_REQUESTS(&cow->front_ring);
+#endif
+    }
+
+    if ( fd >= 0 )
+        close(fd);
+
+#if BATCH_REQS
+    /* i is the first response that was not consumed */
+    cow->front_ring.rsp_cons = i;
+    cow->front_ring.sring->rsp_event = i + 1;
+
+    /* Push added MFNs out */
+    cow->front_ring.req_prod_pvt = req_prod;
+    RING_PUSH_REQUESTS(&cow->front_ring);
+#endif
+
+    cow_ring_unlock(cow);
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_resume for the domain tracked by cow.
+ * Returns the result of do_domctl().
+ */
+int xencow_resume(xencow_t *cow)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_resume;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+
+    return rc;
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_enable for the domain tracked by cow, passing the
+ * MFN stored in cow->sring_mfn.  Returns the result of do_domctl().
+ */
+int xencow_enable(xencow_t *cow)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_enable;
+    domctl.u.cow_enable.mfn = cow->sring_mfn;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+
+    return rc;
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_disable for the domain tracked by cow.
+ * Returns the result of do_domctl().
+ */
+int xencow_disable(xencow_t *cow)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_disable;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+
+    return rc;
+}
+
+/*
+ * Translate a guest PFN to an MFN.
+ *
+ * For an HVM domain the PFN is returned unchanged.  Otherwise the live p2m
+ * table is obtained on first use, cached in cow->live_p2m_table, and
+ * indexed by pfn.
+ *
+ * Returns the table entry, or (unsigned long)-1 if the table could not be
+ * obtained or pfn is not below cow->p2m_size.
+ */
+unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn)
+{
+    if ( cow->is_hvm )
+        return pfn;
+
+    if ( cow->live_p2m_table == NULL )
+    {
+        cow->live_p2m_table =
+            xc_get_live_p2m_table(cow->xc_handle, cow->domain_id,
+                                  cow->p2m_size,
+                                  cow->platform_info.guest_width);
+
+        /* NOTE(review): assumes NULL is the failure return -- confirm */
+        if ( cow->live_p2m_table == NULL )
+        {
+            ERROR("Error mapping live p2m table");
+            return (unsigned long)-1;
+        }
+    }
+
+    /* Never index past the number of entries the table was sized for */
+    if ( pfn >= cow->p2m_size )
+        return (unsigned long)-1;
+
+    return cow->live_p2m_table[pfn];
+}
+
+/*
+ * Create the backing and state files for a new snapshot and store their
+ * names in the snapshot.
+ *
+ * The names are "<domid>.<next_snapshot>" (xencowfs name),
+ * "/tmp/xencow<domid>.<next_snapshot>" (backing file) and the backing name
+ * with ".state" appended.
+ *
+ * On success snapshot->xencowfs_file, ->backing_file and ->state_file point
+ * to heap copies of the names, which the caller owns.  On failure the
+ * snapshot is left untouched.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if a name does not fit its buffer,
+ * -ENOMEM if a name cannot be copied, or the error from
+ * xencow_create_file().
+ */
+static int xencow_create_snapshot_files(xencow_t *cow,
+                                        xencow_snapshot_t *snapshot)
+{
+    char fuse_file[8 * sizeof(int) + 4];
+    char backing_file[200];
+    char state_file[200];
+    char *fuse_copy;
+    char *backing_copy;
+    char *state_copy;
+    int len;
+    int ret;
+
+    /* FIXME: Don't hardcode the path */
+    /* Get file names */
+    len = snprintf(fuse_file, sizeof(fuse_file), "%d.%d",
+                   cow->domain_id, cow->next_snapshot);
+    if ( len < 0 || (size_t)len >= sizeof(fuse_file) )
+        return -ENAMETOOLONG;
+
+    len = snprintf(backing_file, sizeof(backing_file), "/tmp/xencow%s",
+                   fuse_file);
+    if ( len < 0 || (size_t)len >= sizeof(backing_file) )
+        return -ENAMETOOLONG;
+
+    len = snprintf(state_file, sizeof(state_file), "%s.state", backing_file);
+    if ( len < 0 || (size_t)len >= sizeof(state_file) )
+        return -ENAMETOOLONG;
+
+    DPRINTF("fuse: %s; backing: %s; state: %s\n", fuse_file, backing_file,
+            state_file);
+
+    /* Create backing files */
+    ret = xencow_create_file(backing_file);
+    if ( ret != 0 )
+        return ret;
+
+    ret = xencow_create_file(state_file);
+    if ( ret != 0 )
+        return ret;
+
+    /* Store file names; publish them only once all copies succeeded */
+    fuse_copy = strdup(fuse_file);
+    backing_copy = strdup(backing_file);
+    state_copy = strdup(state_file);
+    if ( fuse_copy == NULL || backing_copy == NULL || state_copy == NULL )
+    {
+        free(fuse_copy);
+        free(backing_copy);
+        free(state_copy);
+        return -ENOMEM;
+    }
+
+    snapshot->xencowfs_file = fuse_copy;
+    snapshot->backing_file = backing_copy;
+    snapshot->state_file = state_copy;
+
+    return 0;
+}
+
+/*
+ * Allocate a snapshot record, give it a bitmap sized for cow->p2m_size and
+ * its backing files, and append it to cow->snapshots.
+ *
+ * Returns 0 on success, -ENOMEM if the record cannot be allocated, or the
+ * error from xencow_alloc_bitmap() / xencow_create_snapshot_files().
+ * Nothing is added to the list on failure.
+ */
+static int xencow_init_snapshot(xencow_t *cow)
+{
+    xencow_snapshot_t *snapshot = calloc(1, sizeof(*snapshot));
+    int ret;
+
+    if ( snapshot == NULL )
+    {
+        ERROR("Error allocating snapshot");
+        return -ENOMEM;
+    }
+
+    ret = xencow_alloc_bitmap(&snapshot->bitmap, cow->p2m_size);
+    if ( ret != 0 )
+    {
+        ERROR("Error allocating bitmap");
+        free(snapshot);
+        return ret;
+    }
+
+    ret = xencow_create_snapshot_files(cow, snapshot);
+    if ( ret != 0 )
+    {
+        ERROR("Error creating backing files");
+        /*
+         * NOTE(review): snapshot->bitmap is still allocated here.  Its
+         * allocator is not visible from this function, so it is not
+         * released; add the matching bitmap free once confirmed.
+         */
+        free(snapshot);
+        return ret;
+    }
+
+    list_add_tail(&snapshot->list, &cow->snapshots);
+
+    return 0;
+}
+
+/*
+ * Ask the hypervisor for a snapshot (XEN_DOMCTL_cow_snapshot), then create
+ * the matching userspace record and stamp it with the 'when' value the
+ * domctl returned.  Logs how long the domctl took.
+ *
+ * Returns 0 on success, or the error from do_domctl() /
+ * xencow_init_snapshot().  cow->next_snapshot is advanced only on success.
+ */
+static int xencow_take_snapshot(xencow_t *cow)
+{
+    struct timeval before;
+    struct timeval after;
+    double time_diff;
+    xencow_snapshot_t *snapshot;
+    int ret;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_cow_snapshot;
+    domctl.domain = cow->domain_id;
+
+    gettimeofday(&before, NULL);
+
+    ret = do_domctl(cow->xc_handle, &domctl);
+    if ( ret != 0 )
+    {
+        ERROR("Error taking snapshot");
+        return ret;
+    }
+
+    gettimeofday(&after, NULL);
+
+    /*
+     * Elapsed microseconds.  The seconds field has to be included:
+     * tv_usec alone wraps every second and can even go backwards.
+     */
+    time_diff = (double)(after.tv_sec - before.tv_sec) * 1000000.0
+                + (double)(after.tv_usec - before.tv_usec);
+    IPRINTF("Time spent paused: %fus\n", time_diff);
+
+    ret = xencow_init_snapshot(cow);
+    if ( ret != 0 )
+    {
+        ERROR("Error initialising snapshot");
+        return ret;
+    }
+
+    /* The record just created is the last entry of the list */
+    snapshot = list_bottom(&cow->snapshots, xencow_snapshot_t, list);
+    snapshot->when = domctl.u.cow_snapshot.when;
+
+    cow->next_snapshot++;
+
+    DPRINTF("when = %u\n", (unsigned int)snapshot->when);
+
+    return ret;
+}
+
+/*
+ * Tear down a CoW context: disable CoW for the domain, clear the cached
+ * sizes, close the event channel and Xen interface handles, and finally
+ * release the structure through xencow_free().
+ */
+void xencow_cleanup(xencow_t *cow)
+{
+    /* Disable CoW */
+    xencow_disable(cow);
+
+    /* Reset variables */
+    cow->sring_mfn = 0;
+    cow->num_mfns = 0;
+    cow->p2m_size = 0;
+
+    /* Close event channel */
+    xc_evtchn_close(cow->xce_handle);
+    cow->xce_handle = -1;
+
+    /* Close connection to Xen */
+    xc_interface_close(cow->xc_handle);
+    cow->xc_handle = -1;
+
+    /* Free memory */
+    xencow_free(cow);
+}
+
+/*
+ * Take a snapshot while holding the snapshots lock.
+ * Returns 0 on success or the error from xencow_take_snapshot().
+ */
+int xencow_snapshot(xencow_t *cow)
+{
+    int rc;
+
+    cow_snapshots_lock(cow);
+    rc = xencow_take_snapshot(cow);
+    cow_snapshots_unlock(cow);
+
+    if ( rc == 0 )
+        return 0;
+
+    ERROR("Error taking snapshot");
+    return rc;
+}
+
+/*
+ * Wait up to ms milliseconds for the event channel handle to become
+ * readable.
+ *
+ * Returns the pending port (already unmasked) when an event arrived,
+ * 0 if poll() was interrupted by a signal, -1 if poll() returned without
+ * exactly one ready descriptor, and -2 on any error.
+ */
+int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms)
+{
+    struct pollfd fd = { .fd = cow->xce_handle, .events = POLLIN | POLLERR };
+    int port;
+    int rc;
+
+    rc = poll(&fd, 1, ms);
+
+    if ( rc == -1 )
+    {
+        if ( errno != EINTR )
+        {
+            ERROR("Poll exited with an error");
+            return -2;
+        }
+        return 0;
+    }
+
+    /* Nothing to collect */
+    if ( rc != 1 )
+        return -1;
+
+    port = xc_evtchn_pending(cow->xce_handle);
+    if ( port == -1 )
+    {
+        ERROR("Failed to read port from event channel");
+        return -2;
+    }
+
+    rc = xc_evtchn_unmask(cow->xce_handle, port);
+    if ( rc == -1 )
+    {
+        ERROR("Failed to unmask event channel port");
+        return -2;
+    }
+
+    return port;
+}
+
+/*
+ * Wait for an event by calling xencow_wait_for_event_or_timeout() with a
+ * timeout argument of -1.  Return values are those of that function.
+ */
+int xencow_wait_for_event(xencow_t *cow)
+{
+    int port = xencow_wait_for_event_or_timeout(cow, -1);
+
+    return port;
+}
+
+/*
+ * Query count_info/type_info for the page backing a guest PFN.
+ *
+ * @pfn:        guest frame number; translated with xencow_p2m()
+ * @count_info: filled in on success (may be NULL to skip)
+ * @type_info:  filled in on success (may be NULL to skip)
+ *
+ * Returns the result of do_domctl(); the outputs are written only when it
+ * is 0.
+ */
+int xencow_page_type(xencow_t *cow, unsigned long pfn,
+                      unsigned long *count_info, unsigned long *type_info)
+{
+    unsigned long mfn;
+    int ret;
+    DECLARE_DOMCTL;
+
+    mfn = xencow_p2m(cow, pfn);
+
+    domctl.cmd = XEN_DOMCTL_cow_page_type;
+    domctl.domain = cow->domain_id;
+    domctl.u.cow_page_type.mfn = mfn;
+
+    ret = do_domctl(cow->xc_handle, &domctl);
+
+    /* The reply fields are only meaningful when the domctl succeeded */
+    if ( ret == 0 )
+    {
+        if ( count_info != NULL )
+            *count_info = domctl.u.cow_page_type.count_info;
+        if ( type_info != NULL )
+            *type_info = domctl.u.cow_page_type.type_info;
+    }
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,284 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow.h
+ *
+ * VM memory Copy-on-Write library.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#ifndef __XEN_COW_H__
+#define __XEN_COW_H__
+
+
+#include <inttypes.h>
+#include <xen/xen.h>
+#include <xen/io/cow.h>
+#include <xen/event_channel.h>
+#include <xen/domctl.h>
+#include <xenctrl.h>
+#include <xc_private.h>
+#include "xencow_list.h"
+
+
+/* Sentinel frame number (all ones) marking a state page rather than a guest page */
+#define STATE_MFN  ((unsigned long)(-1))
+
+/* True if _pfn is the state-page sentinel */
+#define state_pfn(_pfn)  ((_pfn) == STATE_MFN)
+
+/*
+ * Size in bytes of a buffer of
+ * ((PAGE_SIZE / 2) / sizeof(unsigned long)) + XEN_COW_RING_PAGES pages.
+ * NOTE(review): presumably one data page per ring slot plus the ring pages
+ * themselves -- confirm against the ring definition in xen/io/cow.h.
+ */
+#define BUFFER_SIZE                                                          \
+    ((((PAGE_SIZE >> 1) / sizeof(unsigned long)) + XEN_COW_RING_PAGES)       \
+     << PAGE_SHIFT)
+
+
+/* Convert a byte offset into a page index */
+#define offset_pfn(_offset)  ((_offset) >> PAGE_SHIFT)
+
+/* Convert a page index into a byte offset (widened to off_t before shifting) */
+#define pfn_offset(_pfn)     (((off_t)(_pfn)) << PAGE_SHIFT)
+
+
+/* Number of bits in an unsigned long */
+#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
+
+/* XXX: stolen from xen/asm/bitops.h */
+/* XXX: should these be in here? are they required to work externally? */
+/*
+ * A userspace build never defines CONFIG_SMP, so keying the prefix on it
+ * would silently drop the lock prefix from the bit operations below.
+ * Always emit it: a library cannot know it runs on a single CPU.
+ */
+#define LOCK_PREFIX "lock ; "
+
+/*
+ * Memory operands for the bit operations below.  Both expand to a
+ * dereference of a variable literally named 'addr', so they can only be
+ * used inside a function that has such a parameter.
+ */
+#define ADDR (*(volatile long *) addr)
+#define CONST_ADDR (*(const volatile long *) addr)
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Executes a single btrl instruction, prefixed with whatever LOCK_PREFIX
+ * expands to; the operation is atomic with respect to other CPUs only when
+ * that expansion is the x86 lock prefix.  The "memory" clobber stops the
+ * compiler from moving memory accesses across the call.
+ */
+static inline void clear_bit(int nr, volatile void *addr)
+{
+    asm volatile (
+        LOCK_PREFIX
+        "btrl %1,%0"
+        : "=m" (ADDR)
+        : "Ir" (nr), "m" (ADDR) : "memory");
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * Executes btsl (prefixed with whatever LOCK_PREFIX expands to) followed by
+ * sbbl, so the result is 0 if the bit was previously clear and -1 (all
+ * ones) if it was already set.  Atomic with respect to other CPUs only when
+ * LOCK_PREFIX expands to the x86 lock prefix.  The "memory" clobber makes
+ * it a compiler barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile void *addr)
+{
+    int oldbit;
+
+    asm volatile (
+        LOCK_PREFIX
+        "btsl %2,%1\n\tsbbl %0,%0"
+        : "=r" (oldbit), "=m" (ADDR)
+        : "Ir" (nr), "m" (ADDR) : "memory");
+    return oldbit;
+}
+
+/**
+ * test_bit - Read a bit without modifying it
+ * @nr: Bit to test
+ * @addr: Address to count from
+ *
+ * Executes btl followed by sbbl, so the result is 0 if the bit is clear
+ * and -1 (all ones) if it is set; callers should test for non-zero rather
+ * than compare against 1.
+ */
+static inline int test_bit(int nr, const volatile void *addr)
+{
+    int oldbit;
+
+    asm volatile (
+        "btl %2,%1\n\tsbbl %0,%0"
+        : "=r" (oldbit)
+        : "m" (CONST_ADDR), "Ir" (nr) : "memory" );
+    return oldbit;
+}
+
+/*
+ * Atomically replace *p with 1 if it is currently 0.
+ *
+ * Returns the value *p held before the call: 0 means the caller changed it
+ * from 0 to 1, non-zero means it was left untouched.
+ *
+ * The "memory" clobber makes this a compiler barrier, which the spin lock
+ * built on top of it needs so that accesses to protected data are not
+ * moved out of the critical section.  The temporary is an int to match the
+ * 32-bit cmpxchgl operand.
+ */
+static inline int testandset (int *p)
+{
+    int readval = 0;
+
+    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
+                          : "+m" (*p), "+a" (readval)
+                          : "r" (1)
+                          : "cc", "memory");
+    return readval;
+}
+
+
+/*
+ * Spin lock: an int that is 0 when free and non-zero when held.
+ *
+ * Each operation carries an explicit compiler barrier so that accesses to
+ * the protected data cannot be moved across the acquire or the release.
+ */
+typedef int spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED 0
+
+/* Busy-wait until the lock has been acquired */
+static inline void spin_lock(spinlock_t *lock)
+{
+    while ( testandset(lock) )
+        ;
+    __asm__ __volatile__ ("" : : : "memory");
+}
+
+/* Put a lock into the unlocked state; call before first use */
+static inline void spin_lock_init(spinlock_t *lock)
+{
+    *lock = SPIN_LOCK_UNLOCKED;
+}
+
+/* Release a lock held by the caller */
+static inline void spin_unlock(spinlock_t *lock)
+{
+    /* Everything done under the lock must be emitted before the release */
+    __asm__ __volatile__ ("" : : : "memory");
+    *(volatile spinlock_t *)lock = SPIN_LOCK_UNLOCKED;
+}
+
+/* Try once to take the lock; returns non-zero if it was acquired */
+static inline int spin_trylock(spinlock_t *lock)
+{
+    int acquired = !testandset(lock);
+
+    __asm__ __volatile__ ("" : : : "memory");
+    return acquired;
+}
+
+/* CoW ring lock: spin-lock wrappers around the ring_lock member of _c */
+#define cow_ring_lock_init(_c)  spin_lock_init(&(_c)->ring_lock)
+#define cow_ring_lock(_c)       spin_lock(&(_c)->ring_lock)
+#define cow_ring_unlock(_c)     spin_unlock(&(_c)->ring_lock)
+
+/* CoW snapshots list lock: spin-lock wrappers around the snapshots_lock member of _c */
+#define cow_snapshots_lock_init(_c)  spin_lock_init(&(_c)->snapshots_lock)
+#define cow_snapshots_lock(_c)       spin_lock(&(_c)->snapshots_lock)
+#define cow_snapshots_unlock(_c)     spin_unlock(&(_c)->snapshots_lock)
+
+
+/* One snapshot of a domain; linked into the snapshots list of a xencow_t */
+typedef struct xencow_snapshot_st {
+    /* list linkage */
+    struct list_head list;
+
+    /* bitmap of PFNs that have been saved */
+    unsigned long   *bitmap;
+
+    /* when the snapshot was taken (value returned by the snapshot domctl) */
+    RING_IDX         when;
+
+    /* files for snapshot image (heap-allocated names) */
+    char            *xencowfs_file;
+    char            *state_file;
+    char            *backing_file;
+} xencow_snapshot_t;
+
+/*
+ * Platform parameters cached for a domain.
+ * NOTE(review): field meanings are inferred from their names and from the
+ * xc_* helper prototypes that take the same values -- confirm at the place
+ * where the structure is filled in.
+ */
+typedef struct platform_info_st {
+    unsigned long max_mfn;
+    unsigned long hvirt_start;
+    unsigned int  pt_levels;
+    unsigned int  guest_width;
+} platform_info_t;
+
+/* Per-domain CoW context handed to every xencow_* call */
+typedef struct xencow_st {
+    /* domain the domctls are issued against */
+    domid_t          domain_id;
+    /* non-zero for an HVM guest; xencow_p2m() then returns PFNs unchanged */
+    int              is_hvm;
+    /* size passed when obtaining the live p2m table and snapshot bitmaps */
+    unsigned long    p2m_size;
+    /* live p2m table, obtained lazily by xencow_p2m(); NULL until then */
+    unsigned long   *live_p2m_table;
+
+    /* handle used for domctls and foreign mappings */
+    int              xc_handle;
+    /* event channel handle that is polled for events */
+    int              xce_handle;
+    
+    /* NOTE(review): presumably the ports bound for buffer/pause events -- confirm */
+    evtchn_port_t    buffer_port;
+    evtchn_port_t    pause_port;
+    
+    platform_info_t  platform_info;
+
+    size_t           buffer_size;
+    void            *buffer;
+
+    /* MFNs of the buffer pages, indexed by ring slot when recycled as requests */
+    int              num_mfns;
+    unsigned long   *mfns;
+
+    /* MFN passed to the hypervisor when CoW is enabled */
+    unsigned long    sring_mfn;
+    cow_front_ring_t front_ring;
+
+    /* protects the ring while it is being flushed */
+    spinlock_t       ring_lock;
+
+    /* base of the page data; slot i lives at page_buffer + (i << PAGE_SHIFT) */
+    void            *page_buffer;
+
+    /* list of xencow_snapshot_t, appended to as snapshots are taken */
+    struct list_head snapshots;
+    /* number used in the next snapshot's file names; 0 means none taken yet */
+    unsigned int     next_snapshot;
+
+    /* protects the snapshots list */
+    spinlock_t       snapshots_lock;
+} xencow_t;
+
+
+/*
+ * Page-table helpers used by the library; their definitions are not part
+ * of this header.
+ */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0);
+
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table,
+                              unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start,
+                              unsigned int pt_levels,
+                              unsigned int guest_width);
+
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width);
+
+
+/* Initialise CoW for a domain */
+xencow_t *xencow_init(domid_t domid);
+
+/* Enable CoW; returns the result of the domctl */
+int xencow_enable(xencow_t *cow);
+
+/* Disable CoW; returns the result of the domctl */
+int xencow_disable(xencow_t *cow);
+
+/* Take a snapshot; returns 0 on success, non-zero on error */
+int xencow_snapshot(xencow_t *cow);
+
+/* Resume a domain paused because of CoW; returns the result of the domctl */
+int xencow_resume(xencow_t *cow);
+
+/* Get the MFN for a PFN (the PFN itself for an HVM domain) */
+unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn);
+
+/* Cleanup a CoW struct: disables CoW, closes the handles, frees the struct */
+void xencow_cleanup(xencow_t *cow);
+
+/*
+ * Wait for an event.
+ * Returns the pending port, 0 if interrupted by a signal, -1 if no event
+ * arrived, or -2 on error.
+ */
+int xencow_wait_for_event(xencow_t *cow);
+int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms);
+
+/* Flush the pre-dirtied page buffer */
+void xencow_flush_buffer(xencow_t *cow);
+
+/*
+ * Read pages from the pre-dirtied buffer.
+ * buffer must hold num_pages pages; returns 0 on success.
+ */
+int xencow_read_buffer(xencow_t *cow, int snapshot_num,
+                       unsigned long start_pfn, int num_pages, void *buffer);
+
+/* Get info for a page */
+int xencow_page_type(xencow_t *cow, unsigned long pfn,
+                     unsigned long *count_info, unsigned long *type_info);
+
+
+#endif /* __XEN_COW_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow_list.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow_list.h    Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,555 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow_list.h
+ *
+ * Linked list.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#ifndef __XEN_COW_LIST_H__
+#define __XEN_COW_LIST_H__
+
+
+/* Taken from Linux kernel code, but de-kernelized for userspace. */
+#include <stddef.h>
+
+/* Drop any definitions of these names pulled in earlier; they are redefined below */
+#undef LIST_HEAD_INIT
+#undef LIST_HEAD
+#undef INIT_LIST_HEAD
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/*
+ * container_of - get the structure that contains a given member
+ * @ptr:    pointer to the member
+ * @type:   type of the containing structure
+ * @member: name of the member within the structure
+ *
+ * Subtracts the member's offset from @ptr.  The temporary gives a compiler
+ * diagnostic if @ptr does not have the member's type.  Relies on the GCC
+ * typeof and statement-expression extensions.
+ */
+#define container_of(ptr, type, member) ({                     \
+        typeof( ((type *)0)->member ) *__mptr = (ptr); \
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+/* List node, also used as the list head; an empty list points at itself */
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+/* Static initialiser for an empty list called 'name' */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+/* Define and initialise an empty list called 'name' */
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+/* Make the list at 'ptr' empty at run time (ptr is evaluated several times) */
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/**
+ * list_top - get the first entry of a list, or NULL if the list is empty
+ * @head:      the head of the list.
+ * @type:      the type of the struct the list_head is embedded in.
+ * @member:    the name of the list_head within the struct.
+ */
+#define list_top(head, type, member)                                   \
+({                                                                     \
+    struct list_head *_head = (head);                                  \
+    list_empty(_head) ? NULL : list_entry(_head->next, type, member);  \
+})
+
+/**
+ * list_bottom - get the last entry of a list, or NULL if the list is empty
+ * @head:      the head of the list.
+ * @type:      the type of the struct the list_head is embedded in.
+ * @member:    the name of the list_head within the struct.
+ */
+#define list_bottom(head, type, member)                                \
+({                                                                     \
+    struct list_head *__lb_head = (head);                              \
+    list_empty(__lb_head)                                              \
+        ? NULL                                                         \
+        : list_entry(__lb_head->prev, type, member);                   \
+})
+
+/*
+ * Link 'new' in between 'prev' and 'next', which must already be
+ * neighbours.
+ *
+ * Internal helper for callers that know both neighbours.
+ */
+static inline void __list_add(struct list_head *new,
+                             struct list_head *prev,
+                             struct list_head *next)
+{
+       /* point the new node at its neighbours ... */
+       new->prev = prev;
+       new->next = next;
+       /* ... then point the neighbours at the new node */
+       prev->next = new;
+       next->prev = new;
+}
+
+/**
+ * list_add - insert an entry right after a head
+ * @new: entry to insert
+ * @head: list head the entry is placed after
+ *
+ * The entry becomes the first element, which gives stack (LIFO) behaviour.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       struct list_head *first = head->next;
+
+       __list_add(new, head, first);
+}
+
+/**
+ * list_add_tail - insert an entry right before a head
+ * @new: entry to insert
+ * @head: list head the entry is placed before
+ *
+ * The entry becomes the last element, which gives queue (FIFO) behaviour.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       struct list_head *last = head->prev;
+
+       __list_add(new, last, head);
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ *
+ * The new entry's own pointers are filled in before the neighbours are
+ * redirected to it.  The body contains no memory barrier, so by itself it
+ * gives no ordering guarantee to concurrent readers.
+ * NOTE(review): presumably a barrier was dropped when this was adapted for
+ * userspace -- confirm before relying on the _rcu name.
+ */
+static __inline__ void __list_add_rcu(struct list_head * new,
+       struct list_head * prev,
+       struct list_head * next)
+{
+       new->next = next;
+       new->prev = prev;
+       next->prev = new;
+       prev->next = new;
+}
+
+/**
+ * list_add_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static __inline__ void list_add_rcu(struct list_head *new,
+                                   struct list_head *head)
+{
+       __list_add_rcu(new, head, head->next);
+}
+
+/**
+ * list_add_tail_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static __inline__ void list_add_tail_rcu(struct list_head *new,
+                                        struct list_head *head)
+{
+       __list_add_rcu(new, head->prev, head);
+}
+
+/*
+ * Unlink whatever sits between 'prev' and 'next' by making the two point
+ * at each other.
+ *
+ * Internal helper for callers that know both neighbours.
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       prev->next = next;
+       next->prev = prev;
+}
+
+/**
+ * list_del - remove an entry from its list
+ * @entry: the element to remove
+ *
+ * The entry's own pointers are overwritten with the poison values, so
+ * list_empty() on it does not return true and it must not be used as a
+ * list position until re-initialised.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       struct list_head *before = entry->prev;
+       struct list_head *after = entry->next;
+
+       __list_del(before, after);
+
+       entry->prev = LIST_POISON2;
+       entry->next = LIST_POISON1;
+}
+
+/**
+ * list_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * Note: list_empty on entry does not return true after this,
+ * the entry is in an undefined state.
+ *
+ * Only the backward pointer is poisoned; entry->next is left intact so
+ * that a walker currently positioned on the removed entry can still move
+ * forward through the list.
+ */
+static inline void list_del_rcu(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_init - remove an entry and turn it into an empty list
+ * @entry: the element to remove
+ *
+ * Afterwards list_empty(entry) is true.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       struct list_head *before = entry->prev;
+       struct list_head *after = entry->next;
+
+       __list_del(before, after);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - unlink an entry and re-insert it right after a head
+ * @list: the entry to move
+ * @head: the head that will precede the entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+       struct list_head *before = list->prev;
+       struct list_head *after = list->next;
+
+       __list_del(before, after);
+
+       /* head->next is read only after the unlink, as in list_add() */
+       __list_add(list, head, head->next);
+}
+
+/**
+ * list_move_tail - unlink an entry and re-insert it right before a head
+ * @list: the entry to move
+ * @head: the head that will follow the entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+       struct list_head *before = list->prev;
+       struct list_head *after = list->next;
+
+       __list_del(before, after);
+
+       /* head->prev is read only after the unlink, as in list_add_tail() */
+       __list_add(list, head->prev, head);
+}
+
+/**
+ * list_empty - report whether a list has no entries
+ * @head: the list to test.
+ *
+ * Returns non-zero if the head's next pointer refers back to the head.
+ */
+static inline int list_empty(struct list_head *head)
+{
+       struct list_head *first = head->next;
+
+       return first == head;
+}
+
+/*
+ * Insert all entries of 'list' right after 'head'.  'list' must not be
+ * empty, and its own head is left pointing at the moved entries.
+ */
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *src_first = list->next;
+       struct list_head *src_last = list->prev;
+       struct list_head *dst_next = head->next;
+
+       /* hook the front of the moved run onto head */
+       src_first->prev = head;
+       head->next = src_first;
+
+       /* hook the back of the moved run onto head's old successor */
+       src_last->next = dst_next;
+       dst_next->prev = src_last;
+}
+
+/**
+ * list_splice - insert the entries of one list into another
+ * @list: the list whose entries are added.
+ * @head: the position in the destination list they are inserted after.
+ *
+ * Does nothing if @list is empty.  @list itself is not reinitialised.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (list_empty(list))
+               return;
+
+       __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - insert the entries of one list into another and
+ *                    empty the source list
+ * @list: the list whose entries are added.
+ * @head: the position in the destination list they are inserted after.
+ *
+ * Does nothing if @list is empty; otherwise @list is reinitialised.
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (list_empty(list))
+               return;
+
+       __list_splice(list, head);
+       INIT_LIST_HEAD(list);
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Plain alias for container_of().
+ */
+#define list_entry(ptr, type, member) \
+       container_of(ptr, type, member)
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @head:      the head for your list.
+ *
+ * @head is evaluated on every iteration.  The current entry must not be
+ * removed inside the loop body, because the step reads pos->next.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev  -       iterate over a list backwards
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @head:      the head for your list.
+ *
+ * @head is evaluated on every iteration.  The current entry must not be
+ * removed inside the loop body, because the step reads pos->prev.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal of
+ *                             list entry
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ *
+ * The successor is saved in @n before the body runs, so the body may
+ * remove @pos from the list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry -       iterate over list of given type
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * If the loop runs to completion (no break), @pos is left pointing at the
+ * container computed from @head itself, which is not a real entry; check
+ * &pos->member against @head before using @pos afterwards.
+ */
+#define list_for_each_entry(pos, head, member)                         \
+       for (pos = list_entry((head)->next, typeof(*pos), member);      \
+            &pos->member != (head);                                    \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * As with the forward variant, @pos does not name a real entry once the
+ * loop has run to completion.
+ */
+#define list_for_each_entry_reverse(pos, head, member)                 \
+       for (pos = list_entry((head)->prev, typeof(*pos), member);      \
+            &pos->member != (head);                                    \
+            pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+
+/**
+ * list_for_each_entry_continue -      iterate over list of given type
+ *                     continuing after existing point
+ * @pos:       the type * to use as a loop counter.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_continue(pos, head, member)                \
+       for (pos = list_entry(pos->member.next, typeof(*pos), member);  \
+            &pos->member != (head);    \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:       the type * to use as a loop counter.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+       for (pos = list_entry((head)->next, typeof(*pos), member),      \
+               n = list_entry(pos->member.next, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+       struct hlist_node *first;
+};
+
+struct hlist_node {
+       struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
+
+/* Report whether @h has no back-link, i.e. its pprev pointer is NULL. */
+static __inline__ int hlist_unhashed(struct hlist_node *h)
+{
+       return h->pprev == NULL;
+}
+
+/* Report whether bucket @h holds no nodes, i.e. its first pointer is NULL. */
+static __inline__ int hlist_empty(struct hlist_head *h)
+{
+       return h->first == NULL;
+}
+
+/*
+ * Unlink @n from its hash list: the pointer that referenced @n is redirected
+ * to @n's successor, and the successor (if there is one) inherits @n's
+ * back-link.  @n's own fields are left unchanged.
+ */
+static __inline__ void __hlist_del(struct hlist_node *n)
+{
+       struct hlist_node **slot = n->pprev;
+       struct hlist_node *succ = n->next;
+
+       *slot = succ;
+       if (succ != NULL)
+               succ->pprev = slot;
+}
+
+/* Unlink @n, then poison both links with LIST_POISON1 / LIST_POISON2. */
+static __inline__ void hlist_del(struct hlist_node *n)
+{
+       __hlist_del(n);
+       n->pprev = LIST_POISON2;
+       n->next = LIST_POISON1;
+}
+
+/**
+ * hlist_del_rcu - deletes entry from hash list without re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_unhashed() on @n does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ */
+static inline void hlist_del_rcu(struct hlist_node *n)
+{
+       __hlist_del(n);
+       n->pprev = LIST_POISON2;
+}
+
+/*
+ * Unlink @n only when it has a back-link (pprev non-NULL), then reset both
+ * of its links to NULL via INIT_HLIST_NODE().  A node without a back-link
+ * is left untouched.
+ */
+static __inline__ void hlist_del_init(struct hlist_node *n)
+{
+       if (n->pprev == NULL)
+               return;
+
+       __hlist_del(n);
+       INIT_HLIST_NODE(n);
+}
+
+#define hlist_del_rcu_init hlist_del_init
+
+/*
+ * Insert @n at the front of bucket @h.  The previous first node, if any,
+ * gets its back-link redirected to @n's next field.
+ */
+static __inline__ void hlist_add_head(struct hlist_node *n,
+                                      struct hlist_head *h)
+{
+       struct hlist_node *first = h->first;
+       n->next = first;
+       if (first)
+               first->pprev = &n->next;
+       h->first = n;
+       n->pprev = &h->first;
+}
+
+/*
+ * Insert @n at the front of bucket @h.  Produces the same linkage as
+ * hlist_add_head(), but both of @n's own fields are filled in before
+ * @h->first is made to point at it.
+ * NOTE(review): no memory barrier is issued here; confirm whether callers
+ * rely on one for lock-free readers.
+ */
+static __inline__ void hlist_add_head_rcu(struct hlist_node *n,
+                                          struct hlist_head *h)
+{
+       struct hlist_node *first = h->first;
+       n->next = first;
+       n->pprev = &h->first;
+       if (first)
+               first->pprev = &n->next;
+       h->first = n;
+}
+
+/*
+ * Insert @n immediately in front of @next.
+ * @next must be != NULL and must already be linked, because its back-link
+ * is read to find the slot that will now reference @n.
+ */
+static __inline__ void hlist_add_before(struct hlist_node *n,
+                                        struct hlist_node *next)
+{
+       n->pprev = next->pprev;
+       n->next = next;
+       next->pprev = &n->next;
+       *(n->pprev) = n;
+}
+
+/*
+ * Insert @next immediately after @n.
+ * @n must already be linked; @next is the new node and its previous link
+ * values are ignored.
+ *
+ * The new node's back-link is set to @n's next field, and the node that
+ * used to follow @n (if any) has its back-link moved to the new node's
+ * next field, so that a later __hlist_del() on either of them works.
+ */
+static __inline__ void hlist_add_after(struct hlist_node *n,
+                                      struct hlist_node *next)
+{
+       next->next = n->next;
+       n->next = next;
+       next->pprev = &n->next;
+
+       if (next->next)
+               next->next->pprev = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+/* Cannot easily do prefetch unfortunately */
+#define hlist_for_each(pos, head) \
+       for (pos = (head)->first; pos; pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+       for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
+            pos = n)
+
+/**
+ * hlist_for_each_entry        - iterate over list of given type
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)                   \
+       for (pos = (head)->first;                                        \
+            pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)                \
+       for (pos = (pos)->next;                                          \
+            pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)                    \
+       for (; pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @n:         another &struct hlist_node to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member)           \
+       for (pos = (head)->first;                                        \
+            pos && ({ n = pos->next; 1; }) &&                           \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = n)
+
+
+#endif /* __XEN_COW_LIST_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/test/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/test/Makefile        Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,32 @@
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += -I $(XEN_XC)
+CFLAGS  += $(CFLAGS_libxenctrl)
+
+SRCS    += cow_compare.c
+
+CFLAGS  += -Werror
+CFLAGS  += -g
+LDFLAGS += -Wl,-rpath,..
+
+LDFLAGS += $(LDFLAGS_libxenctrl) -lxencow
+
+OBJS     = $(SRCS:.c=.o)
+IBINS    = cow_compare
+
+all: $(IBINS)
+
+cow_compare: $(OBJS)
+       $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+       $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR)
+
+clean:
+       rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB)
+
+.PHONY: all clean install
+
+-include $(DEPS)
diff -r 0477f9061c8a tools/xencow/test/cow_compare.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/test/cow_compare.c   Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,896 @@
+/******************************************************************************
+ * tools/xencow/test/cow_compare.c
+ *
+ * Test application to compare CoW image and live memory dumps
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <string.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#if 0
+#include <xen/xen.h>
+#include <xc_private.h>
+#endif
+#include <xg_private.h>
+#include <xg_save_restore.h>
+#include <xenctrl.h>
+
+#include "../lib/xc.h"
+#include "../lib/xencow.h"
+
+
+#define COW 1
+
+#if COW
+#define COW_FILE1   "vm_dump.cow1"
+#define COW_FILE2   "vm_dump.cow2"
+#endif
+#define LIVE_FILE1  "vm_dump.live1"
+#define LIVE_FILE2  "vm_dump.live2"
+
+
+#define SLEEP_TIME  (10 * 1) /* 10 seconds */
+
+
+/* Printing functions */
+#if 1
+#define pr_debug(_f, _a...)                                                   \
+    printf("%s(): " _f, __func__, ##_a)
+#else
+#define pr_debug(_f, _a...)  ((void)0)
+#endif
+
+#define warning(_f, _a...)                                                    \
+    fprintf(stderr, "%s(): " _f, __func__, ##_a)
+
+
+static int debug_mode = 0;
+
+#if 0
+typedef struct page_info_st {
+    unsigned long count_info;
+    unsigned long type_info;
+} page_info_t;
+
+page_info_t get_page_type_info(int xc_handle, domid_t domain_id, xen_pfn_t mfn)
+{
+    page_info_t page_info;
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_cow_page_type;
+    domctl.domain = domain_id;
+    domctl.u.cow_page_type.mfn = mfn;
+
+    rc = do_domctl(xc_handle, &domctl);
+    if ( rc != 0 )
+    {
+        printf("error getting page type for %lx\n", mfn);
+        page_info.count_info = 0;
+        page_info.type_info = 0;
+        return page_info;
+    }
+
+    page_info.count_info = domctl.u.cow_page_type.count_info;
+    page_info.type_info = domctl.u.cow_page_type.type_info;
+    
+    return page_info;
+}
+#endif
+
+/*
+ * Count the bytes that differ between two buffers.
+ *
+ * page1, page2: buffers to compare; each must hold at least page_size bytes.
+ * page_size:    number of bytes to compare.
+ *
+ * Returns 0 when the buffers are identical, otherwise the NEGATED number of
+ * differing bytes (e.g. -3 when three bytes differ).
+ *
+ * When built with VERBOSE set to a non-zero value, every differing offset is
+ * printed.  Bytes are read as unsigned char so that values >= 0x80 print as
+ * two hex digits instead of being sign-extended.
+ */
+int compare_pages(void *page1, void *page2, uint32_t page_size)
+{
+    const unsigned char *bytes1 = page1;
+    const unsigned char *bytes2 = page2;
+    uint32_t i;
+    int rc = 0;
+
+    for ( i = 0; i < page_size; i++ )
+    {
+        if ( bytes1[i] == bytes2[i] )
+            continue;
+
+        rc--;
+#if VERBOSE
+        printf("images do not match at offset %x (%u): ", i, i);
+        printf("(%x) (%x)\n", bytes1[i], bytes2[i]);
+#endif
+    }
+
+    return rc;
+}
+
+/*
+ * Read one page from @fd at byte @offset into @buf, retrying short reads.
+ * @name identifies the file in error messages.
+ * Returns 0 on success, -1 if the seek or read fails or the file ends early.
+ */
+static int compare_read_page(int fd, off64_t offset, void *buf,
+                             const char *name)
+{
+    int total_read = 0;
+
+    if ( lseek64(fd, offset, SEEK_SET) < 0 )
+    {
+        fprintf(stderr, "failed to seek %s: %s\n", name, strerror(errno));
+        return -1;
+    }
+
+    while ( total_read < PAGE_SIZE )
+    {
+        ssize_t bytes_read = read(fd, (char *)buf + total_read,
+                                  PAGE_SIZE - total_read);
+
+        if ( bytes_read < 0 )
+        {
+            fprintf(stderr, "failed to read from %s: %s\n", name,
+                    strerror(errno));
+            return -1;
+        }
+        if ( bytes_read == 0 )
+        {
+            /* errno is not meaningful at end of file, so do not print it */
+            fprintf(stderr, "failed to read from %s: unexpected end of file\n",
+                    name);
+            return -1;
+        }
+
+        total_read += bytes_read;
+    }
+
+    return 0;
+}
+
+/* Map a (masked) XEN_DOMCTL_PFINFO_* page type to the label used in reports. */
+static const char *compare_page_label(unsigned long type)
+{
+    switch ( type )
+    {
+    case XEN_DOMCTL_PFINFO_NOTAB:   return "normal page";
+    case XEN_DOMCTL_PFINFO_L1TAB:   return "l1 table page";
+    case XEN_DOMCTL_PFINFO_L2TAB:   return "l2 table page";
+    case XEN_DOMCTL_PFINFO_L3TAB:   return "l3 table page";
+    case XEN_DOMCTL_PFINFO_L4TAB:   return "l4 table page";
+    case XEN_DOMCTL_PFINFO_LPINTAB: return "pin page";
+    case XEN_DOMCTL_PFINFO_XTAB:    return "invalid page";
+    default:                        return "unknown page";
+    }
+}
+
+/*
+ * Compare two memory dump files page by page and report the differences.
+ *
+ * file1, file2:   dump files; page N is expected at pfn_offset(N).
+ * live_p2m_table: NULL selects HVM handling (the pfn is used as the frame
+ *                 number), otherwise pfn_to_mfn() is used.
+ * p2m_size:       number of pfns to compare (0 .. p2m_size - 1).
+ * m2p_mfn0, hvirt_start, pt_levels: used to locate the hypervisor-reserved
+ *                 range inside page-table pages.
+ * The remaining parameters are not referenced directly by this function.
+ *
+ * For every page whose contents differ a line with the page kind, pfn,
+ * number of differing bytes and "non-hypervisor" byte count is printed.
+ *
+ * Returns 0 if no page was counted as different, otherwise minus the number
+ * of pages counted.  Returns -1 on allocation, open, seek or read failure,
+ * which callers cannot tell apart from "exactly one page differs".
+ * All buffers and file descriptors are released on every path.
+ */
+int compare(char *file1, char *file2, int xc_handle, domid_t domain_id,
+            xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table,
+            unsigned long m2p_mfn0,
+            unsigned long p2m_size, unsigned long max_mfn,
+            unsigned long hvirt_start, unsigned int pt_levels,
+            unsigned int guest_width)
+{
+    int fd1 = -1;
+    int fd2 = -1;
+    void *page1 = NULL;
+    void *page2 = NULL;
+    unsigned long pfn;
+    int hvm = 0;
+    int rc;
+    int ret = 0;
+
+    if ( !live_p2m_table )
+        hvm = 1;
+
+    page1 = malloc(PAGE_SIZE);
+    page2 = malloc(PAGE_SIZE);
+    if ( !page1 || !page2 )
+    {
+        perror("failed to allocate page buffers");
+        ret = -1;
+        goto out;
+    }
+
+    /* Open image files */
+    fd1 = open(file1, O_RDONLY);
+    if ( fd1 < 0 )
+    {
+        perror("failed to open file1");
+        ret = -1;
+        goto out;
+    }
+
+    fd2 = open(file2, O_RDONLY);
+    if ( fd2 < 0 )
+    {
+        perror("failed to open file2");
+        ret = -1;
+        goto out;
+    }
+
+    /* Read images */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+    {
+        off64_t offset = pfn_offset(pfn);
+        xen_pfn_t mfn;
+        int check_frame = 0;
+        int nonhypervisor_bytes = 0;
+
+        /* Read pages */
+        if ( compare_read_page(fd1, offset, page1, "file1") < 0 ||
+             compare_read_page(fd2, offset, page2, "file2") < 0 )
+        {
+            ret = -1;
+            goto out;
+        }
+
+        rc = compare_pages(page1, page2, PAGE_SIZE);
+
+        /* Nothing to report when the pages are identical */
+        if ( rc == 0 )
+            continue;
+
+        /* Get MFN */
+        if ( hvm )
+            mfn = pfn;
+        else
+            mfn = pfn_to_mfn(pfn);
+
+        if ( is_mapped(mfn) )
+        {
+            if ( hvm )
+            {
+                if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                     XEN_DOMCTL_PFINFO_XTAB )
+                    check_frame = 1;
+            }
+            else
+            {
+                /*
+                 * Keep only the low 32 bits, as the original code did.
+                 * NOTE(review): unlike dump_memory(), no
+                 * xc_get_pfn_type_batch() call is made here, so the type
+                 * bits tested below come from the MFN value itself --
+                 * confirm this is intended.
+                 */
+                mfn = (uint32_t)mfn;
+
+                if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                     XEN_DOMCTL_PFINFO_XTAB )
+                {
+                    /* Canonicalise mfn -> pfn */
+                    mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn;
+                    check_frame = 1;
+                }
+            }
+
+            if ( check_frame )
+            {
+                unsigned long type = mfn & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                /* Check if the page is present */
+                if ( type != XEN_DOMCTL_PFINFO_XTAB )
+                {
+                    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                    if ( (type >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                         (type <= XEN_DOMCTL_PFINFO_L4TAB) )
+                    {
+                        int pte_last;
+                        int xen_start;
+                        int xen_end;
+                        int i;
+
+                        printf("page table page: %lx\n", pfn);
+
+                        /*
+                         * We need to determine which entries in this page
+                         * table hold reserved hypervisor mappings. This
+                         * depends on the current page table type as well as
+                         * the number of paging levels.
+                         */
+                        xen_start = xen_end = pte_last =
+                            PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+                        if ( (pt_levels == 2) &&
+                             (type == XEN_DOMCTL_PFINFO_L2TAB) )
+                            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+                        if ( (pt_levels == 3) &&
+                             (type == XEN_DOMCTL_PFINFO_L3TAB) )
+                            xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+                        /*
+                         * In PAE only the L2 mapping the top 1GB contains
+                         * Xen mappings. We can spot this by looking for the
+                         * guest's mapping of the m2p. Guests must ensure
+                         * that this check will fail for other L2s.
+                         */
+                        if ( (pt_levels == 3) &&
+                             (type == XEN_DOMCTL_PFINFO_L2TAB) )
+                        {
+                            int hstart;
+                            uint64_t he;
+
+                            hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) &
+                                     0x1ff;
+                            he = ((const uint64_t *)page1)[hstart];
+
+                            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) ==
+                                 m2p_mfn0 )
+                            {
+                                /* hvirt starts with xen stuff... */
+                                xen_start = hstart;
+                            }
+                            else if ( hvirt_start != 0xf5800000 )
+                            {
+                                /* old L2s from before hole was shrunk... */
+                                hstart = (0xf5800000 >>
+                                          L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+                                he = ((const uint64_t *)page1)[hstart];
+                                if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) ==
+                                     m2p_mfn0 )
+                                    xen_start = hstart;
+                            }
+                        }
+
+                        if ( (pt_levels == 4) &&
+                             (type == XEN_DOMCTL_PFINFO_L4TAB) )
+                        {
+                            /*
+                             * XXX SMH: should compute these from hvirt_start
+                             * (which we have) and hvirt_end (which we don't)
+                             */
+                            xen_start = 256;
+                            xen_end   = 272;
+                        }
+
+                        /*
+                         * Scan for changed bytes that aren't reserved by
+                         * the hypervisor.
+                         * NOTE(review): pte_last, xen_start and xen_end are
+                         * computed as page-table ENTRY indices, but this
+                         * loop uses them as BYTE offsets, so only the first
+                         * pte_last bytes of the page are examined.  Left
+                         * as-is pending confirmation of the intent.
+                         */
+                        for ( i = 0; i < pte_last; i++ )
+                            if ( ((char *)page1)[i] != ((char *)page2)[i] )
+                                if ( !((i >= xen_start) && (i < xen_end)) )
+                                    nonhypervisor_bytes++;
+                    }
+
+                    printf("  %s: %lx: %d %d\n", compare_page_label(type),
+                           pfn, -rc, nonhypervisor_bytes);
+                }
+            }
+        }
+
+        if ( debug_mode )
+            printf("images do not match at page %lx (%lx): %d (%d) bytes different\n",
+                   pfn, mfn, -rc, nonhypervisor_bytes);
+
+        /*
+         * rc is minus the number of differing bytes in the whole page, so
+         * the page is counted unless nonhypervisor_bytes equals that number.
+         * NOTE(review): confirm this is the intended criterion.
+         */
+        if ( nonhypervisor_bytes + rc != 0 )
+            ret--;
+    }
+
+ out:
+    if ( fd2 >= 0 )
+        close(fd2);
+    if ( fd1 >= 0 )
+        close(fd1);
+    free(page2);
+    free(page1);
+
+    return ret;
+}
+
+/*
+ * Write the current contents of a domain's memory to a file.
+ *
+ * filename:       output file; created or truncated.
+ * xc_handle:      handle passed to the xc_* mapping calls.
+ * domain_id:      domain whose pages are mapped and copied.
+ * live_p2m_table: NULL selects HVM handling (the pfn is used as the frame
+ *                 number and mapped with xc_map_foreign_batch()); otherwise
+ *                 pfn_to_mfn() is used and the page type is queried with
+ *                 xc_get_pfn_type_batch().
+ * p2m_size:       number of pfns to visit (0 .. p2m_size - 1).
+ * The remaining parameters are not referenced directly by this function.
+ *
+ * Each copied page is written at pfn_offset(pfn); pfns that are not mapped
+ * or whose type is XEN_DOMCTL_PFINFO_XTAB are skipped, which leaves a gap at
+ * that offset.  Page-table pages are written raw: they are NOT passed
+ * through xc_canonicalise_pagetable().
+ *
+ * Returns 0 on success and -1 on any failure (open, map, type query, seek,
+ * write or close).  The current mapping and the file descriptor are
+ * released on every path.
+ */
+int dump_memory(char *filename, int xc_handle, domid_t domain_id,
+                xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table,
+                unsigned long m2p_mfn0,
+                unsigned long p2m_size, unsigned long max_mfn,
+                unsigned long hvirt_start, unsigned int pt_levels,
+                unsigned int guest_width)
+{
+    int open_flags = O_CREAT | O_TRUNC | O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH |
+                       S_IWUSR | S_IWGRP | S_IWOTH;
+    int fd;
+    unsigned long pfn;
+    void *page = NULL;
+    int hvm = 0;
+    int ret = -1;
+
+    if ( !live_p2m_table )
+        hvm = 1;
+
+    /* Open file */
+    fd = open(filename, open_flags, open_mode);
+    if ( fd < 0 )
+    {
+        perror("failed to open file");
+        return -1;
+    }
+
+    /* Write out memory contents */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+    {
+        xen_pfn_t mfn;
+        int copy_frame = 0;
+        int total_written = 0;
+
+        page = NULL;
+
+        if ( hvm )
+            mfn = pfn;
+        else
+            mfn = pfn_to_mfn(pfn);
+
+        if ( !is_mapped(mfn) )
+            continue;
+
+        /* Read page */
+        if ( hvm )
+        {
+            if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                 XEN_DOMCTL_PFINFO_XTAB )
+            {
+                page = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ,
+                                            &mfn, 1);
+                copy_frame = 1;
+            }
+        }
+        else
+        {
+            /*
+             * xc_get_pfn_type_batch() works on 32-bit entries, so hand it a
+             * real uint32_t holding the low 32 bits of the MFN instead of
+             * aliasing the storage of mfn.
+             */
+            uint32_t pfn_type;
+
+            page = xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE,
+                                        PROT_READ, mfn);
+
+            pfn_type = mfn;
+            if ( xc_get_pfn_type_batch(xc_handle, domain_id, 1, &pfn_type) )
+            {
+                ERROR("get_pfn_type_batch failed");
+                goto out;
+            }
+            mfn = pfn_type;
+
+            if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                 XEN_DOMCTL_PFINFO_XTAB )
+            {
+                /* Canonicalise mfn -> pfn */
+                mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn;
+                copy_frame = 1;
+            }
+        }
+
+        if ( !copy_frame )
+        {
+            /* Page is not dumped; drop the mapping if one was made */
+            if ( page != NULL )
+            {
+                munmap(page, PAGE_SIZE);
+                page = NULL;
+            }
+            continue;
+        }
+
+        if ( page == NULL )
+        {
+            warning("failed to map pfn %lx\n", pfn);
+            goto out;
+        }
+
+        if ( lseek64(fd, pfn_offset(pfn), SEEK_SET) < 0 )
+        {
+            perror("failed to seek file");
+            goto out;
+        }
+
+        while ( total_written < PAGE_SIZE )
+        {
+            /* ssize_t, not size_t: write() reports errors as -1 */
+            ssize_t bytes_written = write(fd, (char *)page + total_written,
+                                          PAGE_SIZE - total_written);
+
+            if ( bytes_written <= 0 )
+            {
+                perror("failed to write to file");
+                goto out;
+            }
+
+            total_written += bytes_written;
+        }
+
+        munmap(page, PAGE_SIZE);
+        page = NULL;
+    }
+
+    ret = 0;
+
+ out:
+    if ( page != NULL )
+        munmap(page, PAGE_SIZE);
+
+    if ( close(fd) < 0 && ret == 0 )
+    {
+        perror("failed to close file");
+        ret = -1;
+    }
+
+    return ret;
+}
+
+#if COW
+/*
+ * Copy a CoW image into a dump file, one page at a time.
+ *
+ * filename: output dump file; created or truncated.
+ * p2m_size: number of pages to copy (pfns 0 .. p2m_size - 1).
+ * cow_file: input file to read the pages from.
+ *
+ * For each pfn, PAGE_SIZE bytes are read from cow_file at pfn_offset(pfn)
+ * and written to filename at the same offset.
+ *
+ * Returns 0 on success and -1 on allocation, open, seek, read, write or
+ * close failure.  The page buffer and both file descriptors are released
+ * on every path.
+ */
+int dump_cow(char *filename, unsigned long p2m_size, char *cow_file)
+{
+    int open_flags = O_CREAT | O_TRUNC | O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH |
+                       S_IWUSR | S_IWGRP | S_IWOTH;
+    int fd_cow = -1;
+    int fd_dump = -1;
+    unsigned long pfn;
+    void *page;
+    int ret = -1;
+
+    page = malloc(PAGE_SIZE);
+    if ( page == NULL )
+    {
+        perror("failed to allocate page buffer");
+        return -1;
+    }
+
+    /* Open files */
+    fd_cow = open(cow_file, O_RDONLY);
+    if ( fd_cow < 0 )
+    {
+        perror("failed to open cow file");
+        goto out;
+    }
+
+    fd_dump = open(filename, open_flags, open_mode);
+    if ( fd_dump < 0 )
+    {
+        perror("failed to open dump file");
+        goto out;
+    }
+
+    /* Write out memory contents */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+    {
+        off64_t offset = pfn_offset(pfn);
+        int total_read = 0;
+        int total_written = 0;
+
+        /* Read page */
+        if ( lseek64(fd_cow, offset, SEEK_SET) < 0 )
+        {
+            perror("failed to seek cow file");
+            goto out;
+        }
+
+        while ( total_read < PAGE_SIZE )
+        {
+            ssize_t bytes_read = read(fd_cow, (char *)page + total_read,
+                                      PAGE_SIZE - total_read);
+
+            if ( bytes_read < 0 )
+            {
+                perror("failed to read cow file");
+                goto out;
+            }
+            if ( bytes_read == 0 )
+            {
+                /* errno is not meaningful at end of file */
+                warning("failed to read cow file: unexpected end of file\n");
+                goto out;
+            }
+
+            total_read += bytes_read;
+        }
+
+        /* Write memory contents to file */
+        if ( lseek64(fd_dump, offset, SEEK_SET) < 0 )
+        {
+            perror("failed to seek dump file");
+            goto out;
+        }
+
+        while ( total_written < PAGE_SIZE )
+        {
+            ssize_t bytes_written = write(fd_dump,
+                                          (char *)page + total_written,
+                                          PAGE_SIZE - total_written);
+
+            if ( bytes_written <= 0 )
+            {
+                perror("failed to write dump file");
+                goto out;
+            }
+
+            total_written += bytes_written;
+        }
+    }
+
+    ret = 0;
+
+ out:
+    if ( fd_dump >= 0 && close(fd_dump) < 0 && ret == 0 )
+    {
+        perror("failed to close dump file");
+        ret = -1;
+    }
+    if ( fd_cow >= 0 )
+        close(fd_cow);
+    free(page);
+
+    return ret;
+}
+#endif
+
+int main(int argc, char *argv[])
+{
+    xc_dominfo_t info;
+    domid_t domain_id;
+    int snapshot_num;
+    int xc_handle;
+    xen_pfn_t *live_m2p_table;
+    xen_pfn_t *live_p2m_table;
+    unsigned long m2p_mfn0;
+    unsigned long p2m_size;
+    unsigned long max_mfn;
+    unsigned long hvirt_start;
+    unsigned int pt_levels;
+    unsigned int guest_width;
+#if 1
+#if COW
+    char fuse_file[200];
+#endif
+#endif
+    int rc;
+
+    domain_id = atoi(argv[1]);
+    /*  TODO: find this automatically */
+    snapshot_num = atoi(argv[2]);
+    
+    if ( argc > 3 )
+    {
+        if ( strcmp(argv[3], "-d") == 0 )
+            debug_mode = 1;
+    }
+
+    /* Open connection to Xen */
+    rc = xc_interface_open();
+    if ( rc < 0 )
+    {
+        warning("failed to connect to Xen\n");
+        goto out;
+    }
+    xc_handle = rc;
+
+    /* Get some info */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &domain_id) + 1;
+
+    rc = get_platform_info(xc_handle, domain_id, &max_mfn, &hvirt_start,
+                           &pt_levels, &guest_width);
+    if ( rc != 1 )
+    {
+        warning("failed to get platform info\n");
+        goto out;
+    }
+
+    /* Get HVM info */
+    rc = xc_domain_getinfo(xc_handle, domain_id, 1, &info);
+    if ( rc != 1 )
+    {
+        warning("failed to get domain info\n");
+        goto out;
+    }
+    
+    /* Print info */
+    if ( debug_mode )
+    {
+        printf("p2m_size: %lu\n", p2m_size);
+        printf("max_mfn: %lx (%lu)\n", max_mfn, max_mfn);
+        printf("hvirt_start: %lx (%lu)\n", hvirt_start, hvirt_start);
+        printf("pt_levels: %x (%u)\n", pt_levels, pt_levels);
+        printf("guest_width: %x (%u)\n", guest_width, guest_width);
+        printf("shared_info_frame: %lx (%lu)\n", info.shared_info_frame, 
info.shared_info_frame);
+    }
+
+    /* Setup the pfn to mfn table mapping */
+    if ( info.hvm )
+    {
+        if ( debug_mode )
+            printf("HVM guest\n");
+
+        live_p2m_table = NULL;
+    }
+    else
+    {
+        if ( debug_mode )
+            printf("PV guest\n");
+
+        /* Get live p2m table */
+        live_p2m_table = xc_get_live_p2m_table(xc_handle, domain_id, p2m_size,
+                                               guest_width);
+        if ( !live_p2m_table )
+        {
+            warning("failed to get live p2m table\n");
+            goto out;
+        }
+    }
+
+    /* Setup the mfn to pfn table mapping */
+    live_m2p_table = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &m2p_mfn0);
+    if ( !live_m2p_table )
+    {
+        warning("failed to map live m2p table\n");
+        goto out;
+    }
+
+    /* Pause domain */
+    printf("Pausing domain\n");
+    rc = xc_domain_pause(xc_handle, domain_id);
+    if ( rc != 0 )
+    {
+        warning("failed to pause domain");
+        goto out;
+    }
+    sleep(1);
+
+#if 1
+#if COW
+    /* Take snapshot */
+    printf("Taking snapshot\n");
+    rc = system("touch /tmp/foo/1");
+    sleep(1);
+
+    /* Dump CoW image */
+    printf("Dumping CoW image (1)... ");
+    fflush(stdout);
+    sprintf(fuse_file, "/tmp/foo/%d.%d", domain_id, snapshot_num);
+    dump_cow(COW_FILE1, p2m_size, fuse_file);
+    printf("done\n");
+#endif
+#endif
+
+    /* Dump live VM image */
+    printf("Dumping live VM image (1)... ");
+    fflush(stdout);
+    dump_memory(LIVE_FILE1, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    printf("done\n");
+
+    /* Dump another live VM image */
+    printf("Dumping live VM image (2)... ");
+    fflush(stdout);
+    dump_memory(LIVE_FILE2, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    printf("done\n");
+
+    /* Unpause domain */
+    printf("Unpausing domain\n");
+    rc = xc_domain_unpause(xc_handle, domain_id);
+    if ( rc != 0 )
+    {
+        warning("failed to unpause domain");
+        goto out;
+    }
+
+#if 1
+#if COW
+    /* Let domain run for a bit */
+    printf("Sleeping for %d seconds\n", SLEEP_TIME);
+    sleep(SLEEP_TIME);
+
+    /* Dump CoW image */
+    printf("Dumping CoW image (2)... ");
+    fflush(stdout);
+    sprintf(fuse_file, "/tmp/foo/%d.0", domain_id);
+    dump_cow(COW_FILE2, p2m_size, fuse_file);
+    printf("done\n");
+#endif
+#endif
+
+    /* Compare images*/
+    printf("-- Comparing images --\n");
+
+    printf("Comparing live1 live2...\n");
+    rc = compare(LIVE_FILE1, LIVE_FILE2, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+
+#if COW
+    printf("Comparing cow1 cow2...\n");
+    rc = compare(COW_FILE1, COW_FILE2, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+
+    printf("Comparing live1 cow1...\n");
+    rc = compare(LIVE_FILE1, COW_FILE1, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+
+    printf("Comparing live1 cow2...\n");
+//    rc = compare(LIVE_FILE1, COW_FILE2, PAGE_SIZE, p2m_size, xc_handle, 
domain_id, live_p2m_table);
+    rc = compare(LIVE_FILE1, COW_FILE2, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+
+    printf("Comparing live2 cow1...\n");
+//    rc = compare(LIVE_FILE2, COW_FILE1, PAGE_SIZE, p2m_size, xc_handle, 
domain_id, live_p2m_table);
+    rc = compare(LIVE_FILE2, COW_FILE1, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+
+    printf("Comparing live2 cow2...\n");
+//    rc = compare(LIVE_FILE2, COW_FILE2, PAGE_SIZE, p2m_size, xc_handle, 
domain_id, live_p2m_table);
+    rc = compare(LIVE_FILE2, COW_FILE2, xc_handle, domain_id, live_p2m_table, 
live_m2p_table,
+                 m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, 
guest_width);
+    if ( rc != 0 )
+        printf("Images do not match (%d pages different)\n", -rc);
+    else
+        printf("Images match\n");
+#endif
+
+ out:
+    return 0;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/xencowfs/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/Makefile    Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,35 @@
+# Build rules for the xencowfs FUSE front end.
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS   += -I $(XEN_XC)
+CFLAGS   += -I ../lib
+CFLAGS   += $(CFLAGS_libxenctrl)
+
+SRCS     += xencowfs.c
+
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -D_FILE_OFFSET_BITS=64
+CFLAGS   += -g
+CFLAGS   += -Wl,-rpath,..
+
+LDFLAGS  += $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) -L../lib -lxencow -lfuse -lpthread
+
+OBJS     = $(SRCS:.c=.o)
+IBINS    = xencowfs
+
+all: $(IBINS)
+
+xencowfs: $(OBJS)
+       $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+       $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR)
+
+clean:
+       rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB)
+
+# None of these targets produce a file of the same name.
+.PHONY: all clean install
+
+-include $(DEPS)
diff -r 0477f9061c8a tools/xencow/xencowfs/README
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/README      Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,24 @@
+
+
+First, launch the desired target domain. Create a mount point for the xencow 
FUSE module to use and from the tools/xencow/xencowfs directory, run:
+
+sudo ./xencowfs <mount point> <domid>
+
+This will initialise CoW for the domain. To take a snapshot, simply poke the FUSE mount point (e.g. touch <mount point>/1). The file name doesn't matter, as FUSE will create its own file named:
+
+<mount point>/<domid>.<snapshot>
+
+e.g. xencow/1.0, xencow/1.1
+
+Currently, the backing files are hardcoded to appear in /tmp with the 
following names:
+
+/tmp/xencow<domid>.<snapshot>
+/tmp/xencow<domid>.<snapshot>.state
+
+The <domid>.<snapshot> pair corresponds to the FUSE file. The .state file 
contains state pages (e.g. CPU registers), while the other file contains the 
pre-dirtied pages for that domain.
+
+It is possible to use XenAccess in file mode to access the snapshot image. A 
slightly modified version of the memory-dump example from XenAccess 0.5 is 
included which is designed to work on the CoW image. To use it, run (I've only 
tried running it from the xenaccess-0.5/examples/):
+
+sudo dump-memory-cow <FUSE image file> <output file>
+
+This will create a complete memory image of the running domain at the time the 
snapshot was taken.
\ No newline at end of file
diff -r 0477f9061c8a tools/xencow/xencowfs/xencowfs.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/xencowfs.c  Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,253 @@
+/******************************************************************************
+ * tools/xencow/xencowfs/xencowfs.c
+ *
+ * VM memory Copy-on-Write FUSE module
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#define FUSE_USE_VERSION 26
+
+
+#include <fuse.h>
+#include <string.h>
+#include <errno.h>
+//#include <xc_private.h>
+
+#include "../lib/xencow.h"
+
+
+#define DEBUG_OUTPUT  0
+
+
+static xencow_t *cow;
+
+/*
+ * Extract the snapshot number from a xencowfs path of the form
+ * "<domid>.<snapshot>" (optionally preceded by '/').
+ *
+ * Returns the decimal number following the last '.' in the path, or -1 if
+ * the path has no '.', nothing numeric follows it, or the value does not fit
+ * in a non-negative int.
+ */
+static inline int get_snapshot_num(const char *path)
+{
+    const char *dot = strrchr(path, '.');
+    char *end;
+    long num;
+
+    /* No '.' at all: this is not a snapshot file name. */
+    if ( dot == NULL )
+        return -1;
+
+    /* Parse what follows the '.'; parsing from the '.' itself yields 0. */
+    num = strtol(dot + 1, &end, 10);
+    if ( (end == dot + 1) || (*end != '\0') )
+        return -1;
+
+    /* Reject negative values and anything that would be truncated. */
+    if ( (num < 0) || ((long)(int)num != num) )
+        return -1;
+
+    return (int)num;
+}
+
+/*
+ * Report whether `path` names one of the snapshots in cow->snapshots.
+ *
+ * A snapshot matches if its xencowfs_file equals `path` exactly, or equals
+ * `path` with a single leading '/' removed.  Returns 1 on a match, else 0.
+ */
+static int path_exists(const char *path)
+{
+    xencow_snapshot_t *snap;
+    const char *relative = (path[0] == '/') ? path + 1 : NULL;
+
+    list_for_each_entry ( snap, &cow->snapshots, list )
+    {
+        if ( strcmp(snap->xencowfs_file, path) == 0 )
+            return 1;
+
+        if ( (relative != NULL)
+             && (strcmp(snap->xencowfs_file, relative) == 0) )
+            return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * FUSE create handler: any attempt to create a file triggers a snapshot.
+ * The requested name, mode and file info are ignored; the result of
+ * xencow_snapshot() is handed back to FUSE unchanged.
+ */
+static int xencowfs_create(const char *path, mode_t mode,
+                           struct fuse_file_info *fi)
+{
+    int rc;
+
+    rc = xencow_snapshot(cow);
+
+    return rc;
+}
+
+/*
+ * FUSE destroy handler: turn CoW off for the domain.  The private data
+ * pointer is unused.
+ */
+static void xencowfs_destroy(void *data)
+{
+    xencow_disable(cow);
+
+    (void) data;
+}
+
+/*
+ * FUSE getattr handler.
+ *
+ * "/" is reported as a directory.  A path matching an existing snapshot is
+ * reported as a read-only regular file whose size is the domain's p2m size
+ * in bytes.  Anything else yields -ENOENT.
+ */
+static int xencowfs_getattr(const char *path, struct stat *stat)
+{
+    int res = 0;
+
+    memset(stat, 0, sizeof(struct stat));
+
+    if ( strcmp(path, "/") == 0 )
+    {
+        stat->st_mode = S_IFDIR | 0755;
+        stat->st_nlink = 2;
+    }
+    else if ( path_exists(path) )
+    {
+        stat->st_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH;
+        stat->st_nlink = 1;
+        /* Widen before shifting so large domains do not overflow. */
+        stat->st_size = (off_t)cow->p2m_size << PAGE_SHIFT;
+        stat->st_blksize = PAGE_SIZE;
+        /* st_blocks counts 512-byte units, not pages. */
+        stat->st_blocks = stat->st_size / 512;
+    }
+    else
+        res = -ENOENT;
+
+    return res;
+}
+
+/*
+ * FUSE init handler: nothing to set up, and no private data is returned.
+ */
+static void *xencowfs_init(struct fuse_conn_info *conn)
+{
+    void *private_data = NULL;
+
+    return private_data;
+}
+
+/*
+ * FUSE open handler: snapshot files are read-only.
+ *
+ * Returns -ENOENT if `path` is not an existing snapshot, -EACCES if the
+ * file is opened with any access mode other than O_RDONLY, and 0 otherwise.
+ */
+static int xencowfs_open(const char *path, struct fuse_file_info *fi)
+{
+    if ( !path_exists(path) )
+        return -ENOENT;
+
+    /* O_ACCMODE is the portable mask for the access-mode bits. */
+    if ( (fi->flags & O_ACCMODE) != O_RDONLY )
+        return -EACCES;
+
+    return 0;
+}
+
+/*
+ * FUSE read handler: copy whole pages of a snapshot into `buffer`.
+ *
+ * Only page-aligned offsets are supported; an unaligned offset is rejected
+ * with -EINVAL.  A read starting at or beyond the end of the domain's
+ * memory returns 0.  The request is clamped to the end of the domain's
+ * memory and the number of bytes actually read is returned.  A non-zero
+ * result from xencow_read_buffer() is returned as-is.
+ */
+static int xencowfs_read(const char *path, char *buffer, size_t size,
+                         off_t offset, struct fuse_file_info *fi)
+{
+    int snapshot_num;
+    unsigned long start_pfn;
+    unsigned long avail_pages;
+    int num_pages;
+    int ret;
+
+    (void) fi;
+
+    /* TODO: Worry about offsets not page aligned */
+    start_pfn = offset_pfn(offset);
+
+    /* Page align check: refuse rather than read from the wrong place */
+    if ( pfn_offset(start_pfn) != offset )
+    {
+        ERROR("Offset not page aligned!");
+        return -EINVAL;
+    }
+
+    /* Nothing to read at or past the end of the domain's memory */
+    if ( start_pfn >= cow->p2m_size )
+        return 0;
+
+    /* Check that it doesn't read past the end of the domain's memory */
+    avail_pages = cow->p2m_size - start_pfn;
+    num_pages = size >> PAGE_SHIFT;
+    if ( (unsigned long)num_pages > avail_pages )
+        num_pages = avail_pages;
+
+    IPRINTF("size = %lx; offset = %lx; num_pages = %d\n",
+            (unsigned long)size, (unsigned long)offset, num_pages);
+
+    /* Get the appropriate snapshot */
+    snapshot_num = get_snapshot_num(path);
+    if ( snapshot_num < 0 )
+        return -ENOENT;
+
+    ret = xencow_read_buffer(cow, snapshot_num, start_pfn, num_pages, buffer);
+    if ( ret != 0 )
+        return ret;
+
+    /* Report what was really copied, which may be less than `size` */
+    return num_pages << PAGE_SHIFT;
+}
+
+/*
+ * FUSE readdir handler: the file system has a single directory, "/",
+ * containing ".", ".." and one entry per snapshot in cow->snapshots.
+ * Any other path yields -ENOENT.
+ */
+static int xencowfs_readdir(const char *path, void *buffer,
+                            fuse_fill_dir_t filler, off_t offset,
+                            struct fuse_file_info *fi)
+{
+    xencow_snapshot_t *entry;
+    int is_root = (strcmp(path, "/") == 0);
+
+    (void) offset;
+    (void) fi;
+
+    if ( !is_root )
+        return -ENOENT;
+
+    filler(buffer, ".", NULL, 0);
+    filler(buffer, "..", NULL, 0);
+
+    list_for_each_entry(entry, &cow->snapshots, list)
+    {
+        filler(buffer, entry->xencowfs_file, NULL, 0);
+    }
+
+    return 0;
+}
+
+/*
+ * FUSE statfs handler: describe a full, read-only file system whose block
+ * and fragment size is one page and whose size is the domain's p2m size in
+ * pages.  Always returns 0.
+ */
+static int xencowfs_statfs(const char *path, struct statvfs *buf)
+{
+    (void) path;
+
+    buf->f_bsize = PAGE_SIZE;
+    /* f_frsize is the unit f_blocks is counted in, so it is a page too. */
+    buf->f_frsize = PAGE_SIZE;
+    buf->f_blocks = cow->p2m_size;
+    buf->f_bfree = 0;
+    buf->f_bavail = 0;
+    buf->f_files = 0;
+    buf->f_ffree = 0;
+    buf->f_favail = 0;
+    buf->f_fsid = 0;
+    buf->f_flag = 0;
+    buf->f_namemax = 255;
+
+    return 0;
+}
+
+/* FUSE callback table for xencowfs, grouped by purpose. */
+static struct fuse_operations xencowfs_oper = {
+    /* Mount lifetime */
+    .init       = xencowfs_init,
+    .destroy    = xencowfs_destroy,
+    /* Metadata */
+    .getattr    = xencowfs_getattr,
+    .readdir    = xencowfs_readdir,
+    .statfs     = xencowfs_statfs,
+    /* File access */
+    .create     = xencowfs_create,
+    .open       = xencowfs_open,
+    .read       = xencowfs_read,
+};
+
+/*
+ * Entry point: xencowfs <mount point> <domid>
+ *
+ * The last argument is the target domain id; everything before it is handed
+ * to fuse_main().  CoW is initialised and enabled for the domain before the
+ * FUSE loop starts.  If fuse_main() fails, CoW is disabled here; otherwise
+ * the destroy callback in xencowfs_oper is relied upon to do so.
+ */
+int main(int argc, char *argv[])
+{
+    domid_t domid;
+    char *end;
+    long id;
+    int rc;
+
+    IPRINTF("Start\n");
+
+    /* Need at least a mount point and a domain id */
+    rc = -EINVAL;
+    if ( argc < 3 )
+    {
+        ERROR("Usage: %s <mount point> <domid>",
+              (argc > 0) ? argv[0] : "xencowfs");
+        exit(rc);
+    }
+
+    /* The last arg is the domain number: a positive decimal integer */
+    id = strtol(argv[argc - 1], &end, 10);
+    domid = (domid_t)id;
+    if ( (end == argv[argc - 1]) || (*end != '\0')
+         || (id <= 0) || ((long)domid != id) )
+    {
+        ERROR("Invalid domain");
+        exit(rc);
+    }
+
+    /* Initialise CoW */
+    IPRINTF("Initialise CoW\n");
+    rc = -ENOMEM;
+    cow = xencow_init(domid);
+    if ( cow == NULL )
+    {
+        ERROR("Could not initialise CoW");
+        exit(rc);
+    }
+
+    /* Enable CoW */
+    IPRINTF("Enable CoW\n");
+    rc = xencow_enable(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Could not enable CoW: rc = %d", rc);
+        exit(rc);
+    }
+    IPRINTF("CoW enabled\n");
+
+    /* Hide the trailing domid from FUSE's own argument parsing */
+    rc = fuse_main(argc - 1, argv, &xencowfs_oper, NULL);
+    if ( rc != 0 )
+       xencow_disable(cow);
+
+    return rc;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/domctl.c     Mon Apr 20 10:21:49 2009 -0700
@@ -28,6 +28,7 @@
 #include <asm/processor.h>
 #include <xsm/xsm.h>
 #include <xen/iommu.h>
+#include <asm/cow.h>
 
 long arch_do_domctl(
     struct xen_domctl *domctl,
@@ -1087,6 +1088,132 @@
     }
     break;
 
+    /* TODO: replace with XEN_DOMCTL_cow_op */
+    /*
+     * Attach the caller-supplied ring page to the target domain's CoW back
+     * ring.  Fails with -EINVAL when targeting the calling domain, when the
+     * supplied mfn is not a valid frame, or when the page cannot be mapped;
+     * fails with -ESRCH when the domain does not exist.
+     */
+    case XEN_DOMCTL_cow_enable:
+    {
+        struct domain *d;
+        void *ring_page;
+
+        ret = -EINVAL;
+        if ( domctl->domain == current->domain->domain_id )
+            break;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        /* FIXME: Some other error code? */
+        ret = -EINVAL;
+
+        /* Never map a frame number that lies outside the frame table */
+        if ( !mfn_valid(domctl->u.cow_enable.mfn) )
+            goto cow_enable_out;
+
+        ring_page = map_domain_page_global(domctl->u.cow_enable.mfn);
+        if ( ring_page == NULL )
+            goto cow_enable_out;
+
+        BACK_RING_INIT(&d->arch.paging.cow.back_ring,
+                       (cow_sring_t *)ring_page, PAGE_SIZE);
+
+        ret = 0;
+
+    cow_enable_out:
+        printk("CoW: enabled: ret: %ld\n", ret);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    /*
+     * Take a snapshot of the target domain and copy the resulting ring
+     * index ("when") back to the caller.
+     */
+    case XEN_DOMCTL_cow_snapshot:
+    {
+        struct domain *d;
+        RING_IDX when;
+
+        /* A domain may not snapshot itself */
+        if ( domctl->domain == current->domain->domain_id )
+        {
+            ret = -EINVAL;
+            break;
+        }
+
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+        {
+            ret = -ESRCH;
+            break;
+        }
+
+        ret = cow_snapshot(d, &when);
+        if ( ret == 0 )
+        {
+            domctl->u.cow_snapshot.when = when;
+
+            if ( copy_to_guest(u_domctl, domctl, 1) )
+                ret = -EFAULT;
+        }
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    /* Resume CoW for the target domain; the result of cow_resume() is returned. */
+    case XEN_DOMCTL_cow_resume:
+    {
+        struct domain *d;
+
+        /* A domain may not operate on itself */
+        if ( domctl->domain == current->domain->domain_id )
+        {
+            ret = -EINVAL;
+            break;
+        }
+
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+        {
+            ret = -ESRCH;
+            break;
+        }
+
+        ret = cow_resume(d);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    /*
+     * Disable CoW for the target domain by calling
+     * paging_log_dirty_disable(); its result is returned.
+     */
+    case XEN_DOMCTL_cow_disable:
+    {
+        struct domain *d;
+
+        /* A domain may not operate on itself */
+        if ( domctl->domain == current->domain->domain_id )
+        {
+            ret = -EINVAL;
+            break;
+        }
+
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+        {
+            ret = -ESRCH;
+            break;
+        }
+
+        ret = paging_log_dirty_disable(d);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    /*
+     * Report count_info and type_info of the page_info for a caller-supplied
+     * mfn.  Fails with -EINVAL when targeting the calling domain or when the
+     * mfn is not a valid frame, -ESRCH when the domain does not exist, and
+     * -EFAULT when the result cannot be copied back.
+     */
+    case XEN_DOMCTL_cow_page_type:
+    {
+        struct domain *d;
+        struct page_info *page;
+
+        ret = -EINVAL;
+        if ( domctl->domain == current->domain->domain_id )
+            break;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        /* An out-of-range mfn would index past the end of the frame table */
+        ret = -EINVAL;
+        if ( mfn_valid(domctl->u.cow_page_type.mfn) )
+        {
+            page = mfn_to_page(domctl->u.cow_page_type.mfn);
+
+            domctl->u.cow_page_type.count_info = page->count_info;
+            domctl->u.cow_page_type.type_info = page->u.inuse.type_info;
+            ret = 0;
+
+            if ( copy_to_guest(u_domctl, domctl, 1) )
+                ret = -EFAULT;
+        }
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
diff -r 0477f9061c8a xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/hvm/hvm.c    Mon Apr 20 10:21:49 2009 -0700
@@ -1542,8 +1542,8 @@
             }
             else
             {
+                paging_mark_dirty(curr->domain, mfn);
                 memcpy(p, buf, count);
-                paging_mark_dirty(curr->domain, mfn);
             }
         }
         else
diff -r 0477f9061c8a xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile  Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/Makefile  Mon Apr 20 10:21:49 2009 -0700
@@ -6,6 +6,7 @@
 obj-y += guest_walk_2.o
 obj-y += guest_walk_3.o
 obj-$(x86_64) += guest_walk_4.o
+obj-y += cow.o
 
 guest_walk_%.o: guest_walk.c Makefile
        $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff -r 0477f9061c8a xen/arch/x86/mm/cow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/cow.c     Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,832 @@
+/******************************************************************************
+ * arch/x86/mm/cow.c
+ *
+ * CoW paging support
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ * Parts based on earlier work by Geoffrey Lefebvre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include <asm/cow.h>
+#include <asm/paging.h>
+#include <xen/event.h>
+
+
+#define COW_DOMAIN_PAUSE 0
+#define COW_DEBUG_OUTPUT 0
+
+
+/* Printouts */
+/* Trace message, prefixed with the calling function's name. */
+#define PAGING_PRINTK(_f, _a...)                                             \
+    debugtrace_printk("pg: %s(): " _f, __func__, ##_a)
+/* Error message sent to the console via printk(). */
+#define PAGING_ERROR(_f, _a...)                                              \
+    printk("pg error: %s(): " _f, __func__, ##_a)
+/*
+ * Conditional debug message: emitted only when PAGING_DEBUG_<flag> is
+ * non-zero.  COW_DEBUG_OUTPUT selects whether it goes to printk() or to
+ * debugtrace_printk().
+ */
+#if COW_DEBUG_OUTPUT
+#define PAGING_DEBUG(flag, _f, _a...)                                        \
+    do {                                                                     \
+        if (PAGING_DEBUG_ ## flag)                                           \
+            printk("pgdebug: %s(): " _f, __func__, ##_a);                    \
+    } while (0)
+#else
+#define PAGING_DEBUG(flag, _f, _a...)                                        \
+    do {                                                                     \
+        if (PAGING_DEBUG_ ## flag)                                           \
+            debugtrace_printk("pgdebug: %s(): " _f, __func__, ##_a);         \
+    } while (0)
+#endif
+
+
+/*
+ * Sentinel passed in place of a guest mfn when the page being copied holds
+ * saved state rather than guest memory (see cow_copy_page() and
+ * cow_save_state()).
+ */
+#define STATE_MFN  ((unsigned long)(-1))
+
+/*
+ * Barrier aliases.
+ * NOTE(review): presumably needed by the ring macros used below - confirm.
+ */
+#define xen_mb()   mb()
+#define xen_rmb()  rmb()
+#define xen_wmb()  wmb()
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) (frame_table + mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#undef page_to_mfn
+#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+
+/* The CoW lock. This protects the log-dirty bitmap from concurrent accesses
+ * (and teardowns, etc).
+ * NOTE(review): the wording above mentions the log-dirty bitmap; presumably
+ * the CoW bitmaps are what is meant - confirm.
+ *
+ * Locking discipline: always acquire log dirty lock before this one. */
+
+/* Initialise the lock and mark it as held by nobody. */
+#define cow_lock_init(_d)                                                    \
+    do {                                                                     \
+        spin_lock_init(&(_d)->arch.paging.cow.lock);                         \
+        (_d)->arch.paging.cow.locker = -1;                                   \
+        (_d)->arch.paging.cow.locker_function = "nobody";                    \
+    } while (0)
+
+/*
+ * Take the lock.  BUG()s if the current processor is already recorded as
+ * the holder, then records the holder's processor and function name.
+ */
+#define cow_lock(_d)                                                         \
+    do {                                                                     \
+        if (unlikely((_d)->arch.paging.cow.locker==current->processor))      \
+        {                                                                    \
+            printk("Error: paging cow lock held by %s\n",                    \
+                   (_d)->arch.paging.cow.locker_function);                   \
+            BUG();                                                           \
+        }                                                                    \
+        spin_lock(&(_d)->arch.paging.cow.lock);                              \
+        ASSERT((_d)->arch.paging.cow.locker == -1);                          \
+        (_d)->arch.paging.cow.locker = current->processor;                   \
+        (_d)->arch.paging.cow.locker_function = __func__;                    \
+    } while (0)
+
+/* Release the lock; asserts that the current processor is the holder. */
+#define cow_unlock(_d)                                                       \
+    do {                                                                     \
+        ASSERT((_d)->arch.paging.cow.locker == current->processor);          \
+        (_d)->arch.paging.cow.locker = -1;                                   \
+        (_d)->arch.paging.cow.locker_function = "nobody";                    \
+        spin_unlock(&(_d)->arch.paging.cow.lock);                            \
+    } while (0)
+
+
+/* XXX: ugly cut and paste from common/grant_table.c */
+/* Number of active grant entries that fit in one page. */
+#define ACGNT_PER_PAGE     (PAGE_SIZE / sizeof(struct active_grant_entry))
+/* Active grant entry number e of grant table t (two-level page/slot lookup). */
+#define active_entry(t, e)                                                   \
+    ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
+
+
+/*
+ * Tasklet body: raise VIRQ_COW_PAUSE in dom0 to report that a domain has
+ * been paused by the CoW code.  The tasklet argument is unused.
+ */
+static void cow_notify_dom0_pause(unsigned long unused)
+{
+    printk("cow: notifying dom0 that domain is paused\n");
+    send_guest_global_virq(dom0, VIRQ_COW_PAUSE);
+}
+static DECLARE_TASKLET(cow_notify_dom0_pause_tasklet, cow_notify_dom0_pause,
+                       0);
+
+/*
+ * Tasklet body: raise VIRQ_COW_BUFFER in dom0 to report that the ring buffer
+ * passed its high water mark, then clear the bool_t flag whose address was
+ * passed as the tasklet argument.
+ */
+static void cow_notify_dom0_high_water(unsigned long flag_addr)
+{
+    printk("cow: notifying dom0 that ring buffer passed high water mark\n");
+    send_guest_global_virq(dom0, VIRQ_COW_BUFFER);
+    (*(bool_t *)flag_addr) = 0;
+}
+static DECLARE_TASKLET(cow_notify_dom0_high_water_tasklet,
+                       cow_notify_dom0_high_water, 0);
+
+/*
+ * Free the bitmap that *bitmap points to, if there is one, and reset the
+ * pointer to NULL.
+ */
+static void paging_free_cow_bitmap(unsigned long **bitmap)
+{
+    /* Nothing allocated: nothing to do */
+    if ( unlikely(*bitmap == NULL) )
+        return;
+
+    printk("cow: freeing bitmap\n");
+    xfree(*bitmap);
+    *bitmap = NULL;
+}
+
+/*
+ * Release both of a domain's CoW bitmaps: the pre-CoW foreign bitmap first,
+ * then the main CoW bitmap.
+ */
+static void paging_free_cow(struct domain *d)
+{
+    unsigned long **foreign = &d->arch.paging.cow.precow_foreign_bitmap;
+    unsigned long **main_bitmap = &d->arch.paging.cow.bitmap;
+
+    printk("cow: freeing bitmaps\n");
+
+    paging_free_cow_bitmap(foreign);
+    paging_free_cow_bitmap(main_bitmap);
+}
+
+/*
+ * Make sure *bitmap refers to a zeroed bitmap of bitmap_size bits.
+ *
+ * If *bitmap is NULL a new array is allocated; otherwise the existing
+ * allocation is reused (assumes it was created with the same bitmap_size -
+ * TODO confirm).  The bitmap is cleared in either case.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int paging_alloc_cow_bitmap(unsigned long **bitmap,
+                                   unsigned long bitmap_size)
+{
+    unsigned long nr_longs;
+
+    BUG_ON(bitmap_size == 0);
+
+    /*
+     * Round up to whole longs so a bit count that is not a multiple of
+     * BITS_PER_LONG is fully covered, and size the memset from the same
+     * figure so it can never exceed the allocation.
+     */
+    nr_longs = (bitmap_size + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+    if ( unlikely(*bitmap == NULL) )
+    {
+        *bitmap = xmalloc_array(unsigned long, nr_longs);
+
+        if ( unlikely(*bitmap == NULL) )
+            return -ENOMEM;
+    }
+
+    memset(*bitmap, 0, nr_longs * sizeof(unsigned long));
+
+    return 0;
+}
+
+/*
+ * Get address of current buffer page for a given domain.
+ *
+ * Consumes the next request from the domain's CoW back ring under the ring
+ * lock and returns the mfn that request carries.  No check is made here that
+ * an unconsumed request exists.
+ * NOTE(review): presumably callers guarantee this through
+ * cow_check_threshold() - confirm.
+ */
+static unsigned long cow_get_buffer_page(struct domain *d)
+{
+    cow_request_t req;
+    cow_back_ring_t *back_ring;
+    RING_IDX req_cons;
+
+    cow_ring_lock(d);
+
+    back_ring = &d->arch.paging.cow.back_ring;
+    req_cons = back_ring->req_cons;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: xen_page_for_domain %d\n", d->domain_id);
+#endif
+
+    /* Get buffer page: copy the request out of the shared ring */
+    memcpy(&req, (RING_GET_REQUEST(back_ring, req_cons)), sizeof(req));
+    req_cons++;
+
+    /* Publish the new consumer index and the next request event index */
+    back_ring->req_cons = req_cons;
+    back_ring->sring->req_event = req_cons + 1;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: num: %ld  buffer mfn %" PRI_mfn "\n",
+           (unsigned long)req_cons, req.mfn);
+#endif
+
+    cow_ring_unlock(d);
+
+    return req.mfn;
+}
+
+/*
+ * Copy one page into the next buffer page taken from the domain's CoW ring
+ * and post a response carrying the page's pfn.
+ *
+ * d          - domain whose CoW ring is used
+ * guest_mfn  - mfn of the page being saved, or STATE_MFN when guest_page
+ *              holds saved state rather than guest memory
+ * guest_page - mapped address of the PAGE_SIZE bytes to copy
+ *
+ * For a real mfn the pfn comes from the M2P table and must be valid
+ * (BUG otherwise); for STATE_MFN the response carries STATE_MFN itself.
+ */
+static void cow_copy_page(struct domain *d, unsigned long guest_mfn,
+                                 void *guest_page)
+{
+    mfn_t gmfn;
+    unsigned long pfn;
+    unsigned long buffer_mfn;
+    void *buffer_page;
+    cow_response_t rsp;
+    cow_back_ring_t *back_ring;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: start\n");
+#endif
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    if ( guest_mfn != STATE_MFN )
+    {
+        gmfn = _mfn(guest_mfn);
+        pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+        BUG_ON(!VALID_M2P(pfn));
+    }
+    else
+        pfn = STATE_MFN;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: locking ring\n");
+#endif
+
+    /* Take the next buffer page off the ring and map it */
+    buffer_mfn = cow_get_buffer_page(d);
+    buffer_page = map_domain_page(buffer_mfn);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: mapped buffer page\n");
+#endif
+
+    /* Copy page */
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy guest page\n");
+#endif
+    memcpy(buffer_page, guest_page, PAGE_SIZE);
+
+    PAGING_DEBUG(COW,
+                 "copied page: mfn %" PRI_mfn
+                 "; pfn %lx; page first chunk (%lx, %lx)  from dom %d\n",
+                 guest_mfn, pfn, *((unsigned long*)guest_page),
+                 *((unsigned long*)buffer_page), d->domain_id);
+
+    /* Unmap pages */
+    unmap_domain_page(buffer_page);
+
+    /* Replace mfn in ring with pfn */
+    cow_ring_lock(d);
+
+    back_ring = &d->arch.paging.cow.back_ring;
+
+    rsp.pfn = pfn;
+    memcpy(RING_GET_RESPONSE(back_ring, back_ring->rsp_prod_pvt),
+           &rsp, sizeof(rsp));
+
+    /* Update number of pages copied */
+    back_ring->rsp_prod_pvt++;
+    RING_PUSH_RESPONSES(back_ring);
+
+    cow_ring_unlock(d);
+}
+
+/*
+ * Save a guest page into a buffer: map the page identified by guest_mfn,
+ * hand it to cow_copy_page(), then drop the mapping again.
+ */
+static void cow_save_page(struct domain *d, unsigned long guest_mfn)
+{
+    void *mapping = map_domain_page(guest_mfn);
+
+    cow_copy_page(d, guest_mfn, mapping);
+
+    unmap_domain_page(mapping);
+}
+
+/*
+ * Pause a domain and schedule the tasklet that notifies dom0.
+ *
+ * d            - domain to pause
+ * guest_mfn    - mfn recorded in cow.paused_guest_mfn
+ * is_pre_dirty - value recorded in cow.is_paused_pre_dirty
+ *
+ * Does nothing beyond a debug message if the domain is already marked as
+ * paused.  With COW_DOMAIN_PAUSE the domain is paused via domain_pause();
+ * otherwise the pause count is raised and each vcpu is put to sleep without
+ * waiting for it to stop.
+ */
+static void cow_pause_domain(struct domain *d, unsigned long guest_mfn,
+                                    bool_t is_pre_dirty)
+{
+#if !COW_DOMAIN_PAUSE
+    struct vcpu *v;
+#endif
+
+    if ( d->arch.paging.cow.is_paused )
+    {
+        PAGING_DEBUG(COW,
+                     "domain already paused domain %d; mfn: %" PRI_mfn "\n",
+                     d->domain_id, guest_mfn);
+        return;
+    }
+
+    /* Record why and where the domain was paused */
+    d->arch.paging.cow.is_paused = 1;
+    d->arch.paging.cow.is_paused_pre_dirty = is_pre_dirty;
+    d->arch.paging.cow.paused_guest_mfn = guest_mfn;
+
+    PAGING_DEBUG(COW,
+                 "not enough buffer space, pausing domain %d; mfn: %"
+                 PRI_mfn "\n", d->domain_id, guest_mfn);
+
+    printk("cow: pausing domain\n");
+
+#if COW_DOMAIN_PAUSE
+    domain_pause(d);
+#else
+    atomic_inc(&d->pause_count);
+
+    for_each_vcpu( d, v )
+//        vcpu_pause_nosync(v);
+        vcpu_sleep_nosync(v);
+#endif
+
+    tasklet_schedule(&cow_notify_dom0_pause_tasklet);
+}
+
+/*
+ * Prepare the two bitmaps a new snapshot needs (pre-CoW foreign bitmap and
+ * CoW bitmap), each of cow.bitmap_size bits and zeroed.
+ *
+ * Returns 0 on success.  On failure the error from the allocator is
+ * returned, the pre-CoW foreign bitmap is freed if it was the second
+ * allocation that failed, and the log-dirty bitmap is freed.
+ */
+static int cow_new_snapshot(struct domain *d)
+{
+    int rc;
+
+    rc = paging_alloc_cow_bitmap(&d->arch.paging.cow.precow_foreign_bitmap,
+                                 d->arch.paging.cow.bitmap_size);
+    if ( unlikely(rc != 0) )
+    {
+        /* FIXME: This probably shouldn't be here any more... */
+        paging_free_log_dirty_bitmap(d);
+        return rc;
+    }
+
+    rc = paging_alloc_cow_bitmap(&d->arch.paging.cow.bitmap,
+                                 d->arch.paging.cow.bitmap_size);
+    if ( unlikely(rc != 0) )
+    {
+        paging_free_cow_bitmap(&d->arch.paging.cow.precow_foreign_bitmap);
+        /* FIXME: This probably shouldn't be here any more... */
+        paging_free_log_dirty_bitmap(d);
+        return rc;
+    }
+
+#if COW_DEBUG_OUTPUT
+    printk("cow enabled for dom %d\n", d->domain_id);
+#endif
+
+    return 0;
+}
+
+/*
+ * Check to make sure there's enough space in the buffer to continue.
+ *
+ * Returns -EBUSY if the domain is still marked as paused, -ENOSPC if fewer
+ * than XEN_COW_RING_THRESHOLD request slots are available, and 0 otherwise.
+ * When the number of available slots drops below cow.ring_high_water and
+ * dom0 has not been told yet, the high-water tasklet is scheduled.
+ */
+static int cow_check_threshold(struct domain *d)
+{
+    RING_IDX req_prod;
+    RING_IDX req_cons;
+    RING_IDX free_slots;
+
+    req_prod = d->arch.paging.cow.back_ring.sring->req_prod;
+    req_cons = d->arch.paging.cow.back_ring.req_cons;
+
+    if ( unlikely(d->arch.paging.cow.is_paused) )
+    {
+        printk("cow_paging: check_threshold: domain still paused\n");
+        return -EBUSY;
+    }
+
+    /* Requests posted by the producer that have not been consumed yet */
+    free_slots = req_prod - req_cons;
+
+    if ( unlikely(free_slots <  XEN_COW_RING_THRESHOLD) )
+    {
+        /* RING_IDX is unsigned, so print it with %u */
+        printk("cow_paging: check_threshold: no space left: req_prod = %u; "
+               "req_cons = %u; free_slots = %u\n",
+               req_prod, req_cons, free_slots);
+        return -ENOSPC;
+    }
+
+    /* Notify ring buffer consumer that we've crossed the high water mark */
+    if ( !d->arch.paging.cow.notified_high_water
+         && (free_slots < d->arch.paging.cow.ring_high_water) )
+    {
+#if COW_DEBUG_OUTPUT
+        printk("cow: check_threshold: passed high water mark\n");
+#endif
+
+        d->arch.paging.cow.notified_high_water = 1;
+        /* The tasklet clears this flag once dom0 has been notified */
+        cow_notify_dom0_high_water_tasklet.data =
+            (unsigned long)&d->arch.paging.cow.notified_high_water;
+        tasklet_schedule(&cow_notify_dom0_high_water_tasklet);
+    }
+
+    return 0;
+}
+
+/*
+ * Save the user registers of every vcpu of a domain into one zeroed page and
+ * push that page through the CoW ring tagged with STATE_MFN.
+ *
+ * The registers are packed back to back in vcpu iteration order.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary page cannot be allocated,
+ * or -ENOSPC if the registers of all vcpus do not fit in a single page.
+ */
+static int cow_save_state(struct domain *d)
+{
+    struct vcpu *v;
+    void *vcpu_page;
+    unsigned int i = 0;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: start\n");
+#endif
+
+    vcpu_page = xmalloc_bytes(PAGE_SIZE);
+    if ( unlikely(vcpu_page == NULL) )
+        return -ENOMEM;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: allocated page\n");
+#endif
+
+    memset(vcpu_page, 0, PAGE_SIZE);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: cleared page\n");
+#endif
+
+    /* Save state for each vcpu */
+    for_each_vcpu(d, v)
+    {
+        const unsigned long regs_size =
+            sizeof(v->arch.guest_context.user_regs);
+
+        /* Never write past the end of the one-page buffer */
+        if ( (i + 1) * regs_size > PAGE_SIZE )
+        {
+            xfree(vcpu_page);
+            return -ENOSPC;
+        }
+
+        memcpy((char *)vcpu_page + (i * regs_size),
+               &v->arch.guest_context.user_regs, regs_size);
+
+        i++;
+    }
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: copied CPU info\n");
+#endif
+
+#if 0
+    cow_copy_page(d, STATE_MFN, d->shared_info);
+#endif
+    cow_copy_page(d, STATE_MFN, vcpu_page);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: copied pages\n");
+#endif
+
+    xfree(vcpu_page);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: done\n");
+#endif
+
+    return 0;
+}
+
+/*
+ * Save every page of domain d that is currently granted for writing.
+ *
+ * Walks the domain's active grant entries under the grant table lock.  Each
+ * entry with a host-write or device-write pin has its pfn marked in the
+ * pre-CoW foreign bitmap and its frame copied via cow_save_page().
+ *
+ * Returns 0 on success.  If the ring does not have room for all such pages
+ * plus XEN_COW_RING_THRESHOLD, nothing is saved,
+ * cow.is_paused_scan_foreign is set and -ENOSPC is returned.
+ */
+static int cow_scan_foreign_mapping(struct domain *d)
+{
+    RING_IDX req_prod;
+    RING_IDX req_cons;
+    int free_slots;
+    unsigned int num_entries;
+    unsigned int i;
+    int ret;
+
+    ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+
+    /* Get the grant table lock */
+    spin_lock(&d->grant_table->lock);
+
+    /* Find active entries: first pass only counts the pages to save */
+    num_entries = 0;
+    for ( i = 0; i < nr_grant_entries(d->grant_table); i++ )
+    {
+        struct active_grant_entry *act = &active_entry(d->grant_table, i);
+
+        /* XXX: Is pin guaranteed to be zero for an inactive grant? */
+        /* XXX: Do I need to worry about device mapping? */
+        if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask )
+            num_entries++;
+    }
+
+    /* Make sure there's enough buffer space for this */
+    req_prod = d->arch.paging.cow.back_ring.sring->req_prod;
+    req_cons = d->arch.paging.cow.back_ring.req_cons;
+    free_slots = req_prod - req_cons;
+
+    ret = -ENOSPC;
+    if ( unlikely(free_slots <  num_entries + XEN_COW_RING_THRESHOLD) )
+    {
+        printk("cow_paging: scan_foreign: not enough space left\n");
+        d->arch.paging.cow.is_paused_scan_foreign = 1;
+        goto out;
+    }
+
+    /* For each entry in the active list, save the page */
+    for ( i = 0; i < nr_grant_entries(d->grant_table); i++ )
+    {
+        struct active_grant_entry *act = &active_entry(d->grant_table, i);
+
+        if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask )
+        {
+            mfn_t gmfn;
+            unsigned long pfn;
+
+            gmfn = _mfn(act->frame);
+
+            /* We /really/ mean PFN here, even for non-translated guests. */
+            pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+            ASSERT(VALID_M2P(pfn));
+            ASSERT(mfn_valid(gmfn));
+
+            /* Set the bit in the precow bitmap */
+            /* NOTE(review): pfn is not checked against cow.bitmap_size here */
+#if 1
+            __set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+#else
+            set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+#endif
+
+            /*
+             * If we have mapping with other domain, we won't be able
+             * to coordinate with them so just save page to be safe
+             */
+            cow_save_page(d, act->frame);
+        }
+    }
+
+    ret = 0;
+ out:
+    /* Release lock */
+    spin_unlock(&d->grant_table->lock);
+    return ret;
+}
+
+static int cow_take_snapshot(struct domain *d)
+{
+    int ret; /* a non-zero result from the first three steps is returned as-is */
+
+#if COW_DEBUG_OUTPUT
+    printk("cow checking threshold\n");
+#endif
+
+    ret = cow_check_threshold(d);
+    if ( unlikely(ret != 0) )
+        return ret;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow new snapshot\n");
+#endif
+
+    ret = cow_new_snapshot(d);
+    if ( unlikely(ret != 0) )
+        return ret;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow saving state\n");
+#endif
+
+    ret = cow_save_state(d);
+    if ( unlikely(ret != 0) )
+        return ret;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow scan foreign\n");
+#endif
+
+    /*
+     * Fill the precow bitmap by scanning the active grant list. We race
+     * with devices here, so we need to coordinate (probably only with dom0).
+     * NOTE(review): the scan's return value (-ENOSPC) is dropped - confirm.
+     */
+    cow_scan_foreign_mapping(d);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow snapshot taken\n");
+#endif
+
+    return 0;
+}
+
+void cow_init(struct domain *d)
+{
+    cow_lock_init(d);      /* main CoW lock */
+    cow_ring_lock_init(d); /* lock for the CoW ring */
+    disable_cow(d);        /* CoW starts out disabled for the domain */
+}
+
+void cow_teardown(struct domain *d)
+{
+    cow_lock(d);
+    paging_free_cow(d); /* called with the CoW lock held */
+    cow_unlock(d);
+}
+
+int cow_enable(struct domain *d)
+{
+    int ret;
+
+    cow_lock(d);
+
+    ret = -EINVAL; /* returned when CoW is already enabled for d */
+    if ( cow_enabled(d) )
+        goto out;
+    /* Bitmap size in bits: max gpfn + 1, rounded up to whole longs */
+    d->arch.paging.cow.bitmap_size =
+        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
+
+    /* 50% high water mark */
+    d->arch.paging.cow.ring_high_water = RING_SIZE(&d->arch.paging.cow.back_ring) >> 1;
+    d->arch.paging.cow.notified_high_water = 0;
+
+    d->arch.paging.cow.is_paused = 0;
+    d->arch.paging.cow.is_paused_pre_dirty = 0;
+    d->arch.paging.cow.is_paused_scan_foreign = 0;
+    d->arch.paging.cow.paused_guest_mfn = 0;
+
+    enable_cow(d);
+
+    ret = 0;
+
+ out:
+    cow_unlock(d);
+    return ret;
+}
+
+void cow_disable(struct domain *d)
+{
+    printk("cow: disable cow for domain %d\n", d->domain_id);
+
+    disable_cow(d); /* the flag is cleared before the CoW lock is taken */
+
+    cow_lock(d);
+    paging_free_cow(d); /* called with the CoW lock held */
+    cow_unlock(d);
+}
+
+/*
+ * Hook for a page of d that is about to be mapped writable by a foreign
+ * domain: save the page now unless the saved-page bitmap already has it.
+ */
+void cow_pre_dirty(struct domain *d, unsigned long guest_mfn)
+{
+    unsigned long pfn;
+    mfn_t gmfn = _mfn(guest_mfn);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    BUG_ON(!VALID_M2P(pfn));
+
+    cow_lock(d);
+
+    ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+    ASSERT(d->arch.paging.cow.bitmap != NULL);
+    BUG_ON( test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) );
+
+    /* Already in the saved-page bitmap: nothing more to do. */
+    if ( __test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+        goto out;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: pre dirty: mfn = %lx\n", guest_mfn);
+#endif
+
+    if ( cow_check_threshold(d) != 0 )
+    {
+        /* Threshold check failed: undo the mark and pause the domain. */
+        __clear_bit(pfn, d->arch.paging.cow.bitmap);
+        cow_pause_domain(d, guest_mfn, 1);
+    }
+    else
+        cow_save_page(d, guest_mfn);
+
+ out:
+    cow_unlock(d);
+}
+
+/*
+ * Note a write to guest_mfn; everything here runs under the CoW lock.
+ * MFNs without a valid M2P entry are ignored.
+ *
+ * The first write to a page that is not yet in the saved-page bitmap
+ * copies the page out (or pauses the domain when the threshold check
+ * fails). A page flagged in the precow foreign bitmap is only moved to
+ * the saved-page bitmap; it is not copied here.
+ */
+void cow_mark_dirty(struct domain *d, unsigned long guest_mfn)
+{
+    unsigned long pfn;
+    mfn_t gmfn = _mfn(guest_mfn);
+
+    cow_lock(d);
+
+    ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+    ASSERT(d->arch.paging.cow.bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(!VALID_M2P(pfn)) )
+        goto out;
+
+    /* Pages outside the precow foreign bitmap take the normal save path. */
+    if ( !test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) )
+    {
+        /* Already in the saved-page bitmap: nothing more to do. */
+        if ( __test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+            goto out;
+
+#if COW_DEBUG_OUTPUT
+        printk("cow: mark dirty: mfn = %lx\n", guest_mfn);
+#endif
+
+        if ( cow_check_threshold(d) != 0 )
+        {
+            /* Threshold check failed: undo the mark and pause the domain. */
+            __clear_bit(pfn, d->arch.paging.cow.bitmap);
+            cow_pause_domain(d, guest_mfn, 0);
+        }
+        else
+            cow_save_page(d, guest_mfn);
+
+        goto out;
+    }
+
+    /*
+     * This is either a ring page (ok) or the guest is racing with a device
+     * to write to the page; but since we are racing with a device, we
+     * can't really save the page either. In the latter case, the
+     * checkpoint will most likely be broken.
+     */
+    PAGING_DEBUG(COW,
+                 "write to precow foreign page %" PRI_mfn
+                 " (pfn=%lx), dom %d\n",
+                 mfn_x(gmfn), pfn, d->domain_id);
+
+    /*
+     * We clear this bit, since the state of the page is now defined
+     * and part of the snapshot, so we want to protect the page if we
+     * write to it.
+     */
+    __clear_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+
+    /* Mark the page as saved; the result only gates the debug trace. */
+    if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+    {
+        PAGING_DEBUG(COW,
+                     "marked precow foreign mfn %"
+                     PRI_mfn " (pfn=%lx), dom %d\n",
+                     mfn_x(gmfn), pfn, d->domain_id);
+    }
+
+ out:
+    cow_unlock(d);
+}
+
+/* Take a CoW snapshot of d; *when gets the ring's rsp_prod at that point. */
+int cow_snapshot(struct domain *d, RING_IDX *when)
+{
+    int ret;
+
+    /* FIXME: Try not to pause/unpause all the time */
+    domain_pause(d);
+
+    /* FIXME: Try not to disable/enable log dirty all the time */
+    if ( cow_enabled(d) )
+        paging_log_dirty_disable(d);
+
+    if ( !cow_enabled(d) )
+    {
+        ret = paging_log_dirty_enable(d, 1);
+        if ( ret != 0 )
+            goto out;
+    }
+
+    cow_lock(d);
+    *when = d->arch.paging.cow.back_ring.sring->rsp_prod;
+#if COW_DEBUG_OUTPUT
+    printk("cow: snapshot: when = %u\n", *when); /* RING_IDX is unsigned */
+#endif
+    ret = cow_take_snapshot(d);
+#if COW_DEBUG_OUTPUT
+    printk("cow: snapshot: took snapshot = %d\n", ret);
+#endif
+    cow_unlock(d);
+
+ out:
+    domain_unpause(d);
+    return ret;
+}
+
+int cow_resume(struct domain *d)
+{
+#if !COW_DOMAIN_PAUSE
+    struct vcpu *v;
+#endif
+    unsigned long mfn;
+    int ret;
+
+    cow_lock(d);
+    ret = -EINVAL;
+    if ( !cow_enabled(d) )
+        goto out;
+
+    if ( d->arch.paging.cow.is_paused == 0 )
+    {
+        ret = 0;
+        goto out;
+    }
+
+    d->arch.paging.cow.is_paused = 0;
+    ret = cow_check_threshold(d);
+    if ( ret != 0 )
+    {
+        d->arch.paging.cow.is_paused = 1;
+        goto out;
+    }
+
+    /* Claim the MFN while locked so a re-pause in the retry (if any) survives */
+    mfn = d->arch.paging.cow.paused_guest_mfn;
+    d->arch.paging.cow.paused_guest_mfn = 0;
+
+    if ( d->arch.paging.cow.is_paused_pre_dirty )
+    {
+        d->arch.paging.cow.is_paused_pre_dirty = 0;
+        cow_unlock(d);
+        cow_pre_dirty(d, mfn);
+    }
+    else if ( d->arch.paging.cow.is_paused_scan_foreign )
+    {
+        d->arch.paging.cow.is_paused_scan_foreign = 0;
+        cow_scan_foreign_mapping(d);
+        cow_unlock(d);
+    }
+    else
+    {
+        cow_unlock(d);
+        cow_mark_dirty(d, mfn);
+    }
+
+#if COW_DOMAIN_PAUSE
+    domain_unpause(d);
+#else
+    if ( atomic_dec_and_test(&d->pause_count) )
+        for_each_vcpu( d, v )
+            vcpu_wake(v);
+#endif
+    return 0;
+
+ out:
+    cow_unlock(d);
+    return ret;
+}
diff -r 0477f9061c8a xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/paging.c  Mon Apr 20 10:21:49 2009 -0700
@@ -26,8 +26,10 @@
 #include <asm/p2m.h>
 #include <asm/hap.h>
 #include <asm/guest_access.h>
+#include <asm/cow.h>
 #include <xen/numa.h>
 #include <xsm/xsm.h>
+#include <xen/grant_table.h>
 
 #define hap_enabled(d) (is_hvm_domain(d) && (d)->arch.hvm_domain.hap_enabled)
 
@@ -158,7 +160,7 @@
 {
     d->arch.paging.log_dirty.allocs--;
     free_domheap_page(mfn_to_page(mfn));
-}    
+}
 
 void paging_free_log_dirty_bitmap(struct domain *d)
 {
@@ -207,7 +209,7 @@
     d->arch.paging.log_dirty.failed_allocs = 0;
 }
 
-int paging_log_dirty_enable(struct domain *d)
+int paging_log_dirty_enable(struct domain *d, bool_t enable_cow)
 {
     int ret;
 
@@ -226,6 +228,9 @@
         paging_free_log_dirty_bitmap(d);
         goto out;
     }
+
+    if ( enable_cow )
+        cow_enable(d);
 
     log_dirty_unlock(d);
 
@@ -253,11 +258,33 @@
     ret = d->arch.paging.log_dirty.disable_log_dirty(d);
     log_dirty_lock(d);
     if ( !paging_mode_log_dirty(d) )
+    {
         paging_free_log_dirty_bitmap(d);
+
+        if ( cow_enabled(d) )
+            cow_disable(d);
+    }
     log_dirty_unlock(d);
     domain_unpause(d);
 
     return ret;
+}
+
+void paging_pre_dirty(struct domain *d, unsigned long guest_mfn)
+{
+    mfn_t gmfn;
+
+    gmfn = _mfn(guest_mfn);
+
+    if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) )
+        return;
+
+    log_dirty_lock(d);
+    
+    if ( cow_enabled(d) )
+        cow_pre_dirty(d, guest_mfn);
+
+    log_dirty_unlock(d);
 }
 
 /* Mark a page as dirty */
@@ -327,11 +354,14 @@
     unmap_domain_page(l1);
     if ( changed )
     {
-        PAGING_DEBUG(LOGDIRTY, 
+        PAGING_DEBUG(LOGDIRTY,
                      "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
                      mfn_x(gmfn), pfn, d->domain_id);
         d->arch.paging.log_dirty.dirty_count++;
     }
+
+    if ( cow_enabled(d) )
+        cow_mark_dirty(d, guest_mfn);
 
  out:
     log_dirty_unlock(d);
@@ -471,13 +501,20 @@
     d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
     d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
     d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+
+    cow_init(d);
 }
 
 /* This function fress log dirty bitmap resources. */
 void paging_log_dirty_teardown(struct domain*d)
 {
     log_dirty_lock(d);
+
     paging_free_log_dirty_bitmap(d);
+
+    if ( cow_enabled(d) )
+        cow_teardown(d);
+
     log_dirty_unlock(d);
 }
 /************************************************/
@@ -552,11 +589,11 @@
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
-        return paging_log_dirty_enable(d);
+        return paging_log_dirty_enable(d, 0);
 
     case XEN_DOMCTL_SHADOW_OP_ENABLE:
         if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
-            return paging_log_dirty_enable(d);
+            return paging_log_dirty_enable(d, 0);
 
     case XEN_DOMCTL_SHADOW_OP_OFF:
         if ( paging_mode_log_dirty(d) )
diff -r 0477f9061c8a xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Mon Apr 20 10:21:49 2009 -0700
@@ -36,6 +36,7 @@
 #include <asm/hvm/cacheattr.h>
 #include <asm/mtrr.h>
 #include <asm/guest_pt.h>
+#include <asm/paging.h>
 #include "private.h"
 #include "types.h"
 
@@ -4598,6 +4599,8 @@
     }
 #endif
                 
+    paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
+
     /* Unaligned writes mean probably this isn't a pagetable */
     if ( vaddr & (bytes - 1) )
         sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
@@ -4623,6 +4626,8 @@
                     MAPPING_EXCEPTION :
                     (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
                     MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
+
+        paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
 
         /* Cross-page writes mean probably not a pagetable */
         sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
diff -r 0477f9061c8a xen/common/grant_table.c
--- a/xen/common/grant_table.c  Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/common/grant_table.c  Mon Apr 20 10:21:49 2009 -0700
@@ -352,6 +352,15 @@
             goto undo_out;
         }
 
+        /*
+         * If the mapping is writable, do something before the page is mapped.
+         * We may end up doing something useless if the mapping fails but
+         * otherwise we could end up racing with the guest
+         * (although very unlikely)
+         */
+        if ( !(op->flags & GNTMAP_readonly) )
+            gnttab_pre_dirty(rd, frame);
+
         rc = create_grant_host_mapping(
             op->host_addr, frame, op->flags, cache_flags);
         if ( rc != GNTST_okay )
@@ -582,7 +591,7 @@
 
     /* If just unmapped a writable mapping, mark as dirtied */
     if ( !(op->flags & GNTMAP_readonly) )
-         gnttab_mark_dirty(rd, op->frame);
+         gnttab_post_dirty(rd, op->frame);
 
  unmap_out:
     op->status = rc;
@@ -1255,7 +1264,7 @@
     }
     else
     {
-        gnttab_mark_dirty(rd, r_frame);
+        gnttab_post_dirty(rd, r_frame);
 
         act->pin -= GNTPIN_hstw_inc;
         if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) )
@@ -1444,6 +1453,8 @@
         goto error_out;
     }
 
+    gnttab_pre_dirty(dd, d_frame);
+
     sp = map_domain_page(s_frame);
     dp = map_domain_page(d_frame);
 
@@ -1452,7 +1463,7 @@
     unmap_domain_page(dp);
     unmap_domain_page(sp);
 
-    gnttab_mark_dirty(dd, d_frame);
+    gnttab_post_dirty(dd, d_frame);
 
     put_page_and_type(mfn_to_page(d_frame));
  error_out:
diff -r 0477f9061c8a xen/include/asm-x86/cow.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/cow.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,83 @@
+/******************************************************************************
+ * include/asm-x86/cow.h
+ *
+ * Common interface for cow support.
+ * 
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ * Parts based on earlier work by Geoffrey Lefebvre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#ifndef __COW_H__
+#define __COW_H__
+
+
+#include <xen/sched.h>
+
+
+/* Flag used for CoW debug */
+#define PAGING_DEBUG_COW  1
+
+
+/* CoW helper functions */
+#define cow_enabled(_d) ((_d)->is_cow)
+#define enable_cow(_d) ((_d)->is_cow = 1)
+#define disable_cow(_d) ((_d)->is_cow = 0)
+
+/* CoW ring lock */
+#define cow_ring_lock_init(_d)  spin_lock_init(&(_d)->arch.paging.cow.ring_lock)
+#define cow_ring_lock(_d)       spin_lock(&(_d)->arch.paging.cow.ring_lock)
+#define cow_ring_unlock(_d)     spin_unlock(&(_d)->arch.paging.cow.ring_lock)
+
+
+/* Enable CoW */
+int cow_enable(struct domain *d);
+
+/* Disable CoW */
+void cow_disable(struct domain *d);
+
+/* CoW initialisation */
+void cow_init(struct domain *d);
+
+/* CoW teardown */
+void cow_teardown(struct domain *d);
+
+/* Take a CoW snapshot */
+int cow_snapshot(struct domain *d, RING_IDX *when);
+
+/* Resume a domain paused because of CoW (buffer was full) */
+int cow_resume(struct domain *d);
+
+/* We use the mapping and unmapping of the page as a conservative boundary
+ * on the page being written to by the foreign domain */
+void cow_pre_dirty(struct domain *d, unsigned long guest_mfn);
+
+/* Copy pages out and mark them as dirty so they don't get copied again */
+void cow_mark_dirty(struct domain *d, unsigned long guest_mfn);
+
+
+#endif /* __COW_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/domain.h      Mon Apr 20 10:21:49 2009 -0700
@@ -6,6 +6,7 @@
 #include <asm/hvm/vcpu.h>
 #include <asm/hvm/domain.h>
 #include <asm/e820.h>
+#include <public/io/cow.h>
 
 #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
 #define is_pv_32bit_domain(d)  ((d)->arch.is_32bit_pv)
@@ -149,6 +150,41 @@
 };
 
 /************************************************/
+/*                 copy-on-write                */
+/************************************************/
+struct cow_domain {
+    /* cow lock */
+    spinlock_t          lock;
+    int                 locker; /* processor that holds the lock */
+    const char         *locker_function; /* func that took it */
+
+    /* ring lock */
+    spinlock_t          ring_lock;
+
+    /* size of the cow bitmaps */
+    unsigned long       bitmap_size;
+
+    /* cow bitmap to record foreign pages before cow was enabled */
+    unsigned long      *precow_foreign_bitmap;
+
+    /* cow bitmap to record pages that have been saved */
+    unsigned long      *bitmap;
+
+    /* back-end ring for reading mfns and storing pfns */
+    cow_back_ring_t     back_ring;
+
+    /* high water mark for ring */
+    RING_IDX            ring_high_water;
+    bool_t              notified_high_water;
+
+    /* paused domain */
+    bool_t              is_paused;
+    bool_t              is_paused_pre_dirty;
+    bool_t              is_paused_scan_foreign;
+    unsigned long       paused_guest_mfn;
+};
+
+/************************************************/
 /*       common paging data structure           */
 /************************************************/
 struct log_dirty_domain {
@@ -181,6 +217,8 @@
     struct hap_domain       hap;
     /* log dirty support */
     struct log_dirty_domain log_dirty;
+    /* cow support */
+    struct cow_domain cow;
 };
 
 struct paging_vcpu {
diff -r 0477f9061c8a xen/include/asm-x86/grant_table.h
--- a/xen/include/asm-x86/grant_table.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/grant_table.h Mon Apr 20 10:21:49 2009 -0700
@@ -31,7 +31,8 @@
 #define gnttab_shared_gmfn(d, t, i)                     \
     (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
 
-#define gnttab_mark_dirty(d, f) paging_mark_dirty((d), (f))
+#define gnttab_pre_dirty(d, f)  paging_pre_dirty((d), (f))
+#define gnttab_post_dirty(d, f) paging_mark_dirty((d), (f))
 
 static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
 {
diff -r 0477f9061c8a xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/paging.h      Mon Apr 20 10:21:49 2009 -0700
@@ -140,7 +140,7 @@
 void paging_free_log_dirty_bitmap(struct domain *d);
 
 /* enable log dirty */
-int paging_log_dirty_enable(struct domain *d);
+int paging_log_dirty_enable(struct domain *d, bool_t enable_cow);
 
 /* disable log dirty */
 int paging_log_dirty_disable(struct domain *d);
@@ -152,6 +152,7 @@
                            void (*clean_dirty_bitmap)(struct domain *d));
 
 /* mark a page as dirty */
+void paging_pre_dirty(struct domain *d, unsigned long guest_mfn);
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
 
 /*
diff -r 0477f9061c8a xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/public/domctl.h       Mon Apr 20 10:21:49 2009 -0700
@@ -33,6 +33,7 @@
 #endif
 
 #include "xen.h"
+#include "io/ring.h"
 
 #define XEN_DOMCTL_INTERFACE_VERSION 0x00000005
 
@@ -645,6 +646,41 @@
 } xen_domctl_hvmcontext_partial_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
 
+/* FIXME: use types instead of different domctls */
+/*
+ * Enable/disable Copy-on-write for a domain.
+ */
+#define XEN_DOMCTL_cow_enable       56
+#define XEN_DOMCTL_cow_snapshot     57
+#define XEN_DOMCTL_cow_resume       58
+#define XEN_DOMCTL_cow_disable      59
+#define XEN_DOMCTL_cow_page_type    60
+
+struct xen_domctl_cow_enable {
+    /* IN: mfn of the ring buffer */
+    unsigned long mfn;
+};
+typedef struct xen_domctl_cow_enable xen_domctl_cow_enable_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_enable_t);
+
+struct xen_domctl_cow_snapshot {
+    /* OUT: when the snapshot took place (rsp_prod) */
+    RING_IDX when;
+};
+typedef struct xen_domctl_cow_snapshot xen_domctl_cow_snapshot_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_snapshot_t);
+
+struct xen_domctl_cow_page_type {
+    /* IN: mfn of the page */
+    unsigned long mfn;
+    /* OUT: count info */
+    unsigned long count_info;
+    /* OUT: page type info */
+    unsigned long type_info;
+};
+typedef struct xen_domctl_cow_page_type xen_domctl_cow_page_type_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_page_type_t);
+
 
 struct xen_domctl {
     uint32_t cmd;
@@ -687,6 +723,9 @@
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
         struct xen_domctl_debug_op          debug_op;
+        struct xen_domctl_cow_enable        cow_enable;
+        struct xen_domctl_cow_snapshot      cow_snapshot;
+        struct xen_domctl_cow_page_type     cow_page_type;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
diff -r 0477f9061c8a xen/include/public/io/cow.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/io/cow.h       Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,82 @@
+/*****************************************************************************
+ * cow.h
+ *
+ * CoW common structures
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (C) 2009 University of British Columbia (Patrick Colp)
+ */
+
+#ifndef _XEN_PUBLIC_IO_COW_H
+#define _XEN_PUBLIC_IO_COW_H
+
+
+#include "ring.h"
+
+
+#define RING_MASK(_r, _i)  ((_i) & (RING_SIZE(_r) - 1))
+
+
+#define XEN_COW_IOC_MAGIC   'w'
+#define XEN_COW_IOCTL_INIT  _IO(XEN_COW_IOC_MAGIC, 1)
+
+#define XEN_COW_RING_PAGES  1 /* TODO: 2+ pages? */
+#define XEN_COW_RING_SIZE   (XEN_COW_RING_PAGES << PAGE_SHIFT)
+
+#define XEN_COW_RING_THRESHOLD  16
+
+
+/* Some definitions for the XenCow ring buffer. */
+typedef struct cow_request_st {
+   unsigned long mfn;
+} cow_request_t;
+
+typedef struct cow_response_st {
+    unsigned long pfn;
+} cow_response_t;
+
+
+DEFINE_RING_TYPES(cow, cow_request_t, cow_response_t);
+
+
+/*
+ * The structure used to initialise CoW.
+ */
+typedef struct cow_init_st {
+    /* Start address of buffer */
+    unsigned long addr;
+    /* Number of frames in buffer */
+    int num_mfns;
+    /* MFNs of buffer frames */
+    unsigned long mfns[];
+} cow_init_t;
+
+
+#endif /* _XEN_PUBLIC_IO_COW_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/include/public/xen.h
--- a/xen/include/public/xen.h  Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/public/xen.h  Mon Apr 20 10:21:49 2009 -0700
@@ -143,6 +143,8 @@
 #define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
 #define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
 #define VIRQ_CON_RING   8  /* G. (DOM0) Bytes received on console            */
+#define VIRQ_COW_BUFFER 9  /* G. (DOM0) CoW buffer has pages available       */
+#define VIRQ_COW_PAUSE  10 /* G. (DOM0) CoW domain has been paused           */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
diff -r 0477f9061c8a xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/xen/sched.h   Mon Apr 20 10:21:49 2009 -0700
@@ -223,6 +223,8 @@
     bool_t           is_paused_by_controller;
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
+    /* Is this guest doing CoW? */
+    bool_t           is_cow;
 
     /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
     DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
 |