
xen-changelog

[Xen-changelog] [xen-unstable] merge with xen-unstable.hg

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] merge with xen-unstable.hg
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 02 Jul 2008 05:40:26 -0700
Delivery-date: Wed, 02 Jul 2008 05:41:06 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1214965837 -32400
# Node ID 08f77df14cba8e2dfe580779bb9ca2f64e1ae0ae
# Parent  11318234588e61b45df5a06fe6a29264854ba22a
# Parent  19970181d6a46aee1199857b6d3c6bedc7507121
merge with xen-unstable.hg
---
 docs/ChangeLog                               |    9 
 extras/mini-os/arch/x86/mm.c                 |   11 
 extras/mini-os/blkfront.c                    |    1 
 extras/mini-os/fbfront.c                     |    2 
 extras/mini-os/fs-front.c                    |   10 
 extras/mini-os/lib/sys.c                     |    2 
 extras/mini-os/netfront.c                    |    6 
 stubdom/grub.patches/99minios                |   10 
 stubdom/grub/Makefile                        |    2 
 tools/blktap/drivers/Makefile                |   10 
 tools/blktap/drivers/blktapctrl.c            |    2 
 tools/blktap/drivers/block-qcow.c            |   35 +
 tools/blktap/drivers/block-qcow2.c           |    5 
 tools/blktap/drivers/check_gcrypt            |   14 
 tools/blktap/lib/blktaplib.h                 |    2 
 tools/debugger/xenitp/xenitp.c               |   24 
 tools/examples/xend-config.sxp               |    3 
 tools/firmware/hvmloader/hvmloader.c         |   10 
 tools/firmware/rombios/rombios.c             |   35 -
 tools/ioemu/hw/xen_console.c                 |    8 
 tools/ioemu/target-i386-dm/exec-dm.c         |   17 
 tools/ioemu/xenstore.c                       |   11 
 tools/libxc/ia64/xc_ia64_hvm_build.c         |    7 
 tools/libxc/ia64/xc_ia64_linux_restore.c     |   24 
 tools/libxc/ia64/xc_ia64_linux_save.c        |   19 
 tools/libxc/xc_core.c                        |    8 
 tools/libxc/xc_core_ia64.c                   |    3 
 tools/libxc/xc_core_ia64.h                   |    2 
 tools/libxc/xc_domain.c                      |   65 --
 tools/libxc/xc_domain_restore.c              |   12 
 tools/libxc/xc_domain_save.c                 |   20 
 tools/libxc/xc_misc.c                        |   28 
 tools/libxc/xc_pagetab.c                     |    4 
 tools/libxc/xc_private.h                     |    4 
 tools/libxc/xc_ptrace.c                      |   34 -
 tools/libxc/xc_ptrace_core.c                 |    8 
 tools/libxc/xc_resume.c                      |   10 
 tools/libxc/xenctrl.h                        |   44 +
 tools/libxc/xg_save_restore.h                |   22 
 tools/python/xen/util/blkif.py               |   41 -
 tools/python/xen/xend/XendConfig.py          |    2 
 tools/python/xen/xend/XendOptions.py         |    7 
 tools/python/xen/xend/image.py               |   20 
 tools/python/xen/xend/server/blkif.py        |    6 
 tools/python/xen/xm/main.py                  |    3 
 tools/tests/test_x86_emulator.c              |    9 
 tools/xenballoon/xenballoon-monitor          |   43 +
 tools/xenballoon/xenballoon.conf             |   91 +++
 tools/xenballoon/xenballoond                 |  205 ++++++
 tools/xenballoon/xenballoond.README          |   82 ++
 tools/xenballoon/xenballoond.init            |   91 +++
 tools/xentrace/xenctx.c                      |    8 
 tools/xm-test/lib/XmTestLib/block_utils.py   |    2 
 xen/arch/ia64/vmx/vmx_hypercall.c            |   47 +
 xen/arch/ia64/xen/mm.c                       |    6 
 xen/arch/x86/acpi/cpufreq/Makefile           |    1 
 xen/arch/x86/acpi/cpufreq/cpufreq.c          |  139 +++-
 xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c |   14 
 xen/arch/x86/acpi/cpufreq/powernow.c         |  305 ++++++++++
 xen/arch/x86/acpi/cpufreq/utility.c          |  103 +++
 xen/arch/x86/acpi/pmstat.c                   |    7 
 xen/arch/x86/acpi/power.c                    |   25 
 xen/arch/x86/hvm/emulate.c                   |  113 +--
 xen/arch/x86/hvm/hvm.c                       |   60 +
 xen/arch/x86/hvm/vmx/vmcs.c                  |  100 +--
 xen/arch/x86/hvm/vmx/vmx.c                   |   11 
 xen/arch/x86/hvm/vmx/vpmu_core2.c            |   20 
 xen/arch/x86/mm.c                            |   45 +
 xen/arch/x86/mm/shadow/common.c              |  811 ++++++++++++++++++++++++++-
 xen/arch/x86/mm/shadow/multi.c               |  559 +++++++++++++++++-
 xen/arch/x86/mm/shadow/multi.h               |   14 
 xen/arch/x86/mm/shadow/private.h             |  130 ++++
 xen/arch/x86/mm/shadow/types.h               |    5 
 xen/arch/x86/platform_hypercall.c            |    7 
 xen/arch/x86/x86_emulate/x86_emulate.c       |  700 ++++++++++++++++++-----
 xen/arch/x86/x86_emulate/x86_emulate.h       |   37 -
 xen/common/domain.c                          |  259 ++++----
 xen/drivers/passthrough/vtd/dmar.c           |    3 
 xen/drivers/passthrough/vtd/dmar.h           |   16 
 xen/drivers/passthrough/vtd/intremap.c       |    7 
 xen/drivers/passthrough/vtd/iommu.c          |   16 
 xen/drivers/passthrough/vtd/qinval.c         |   16 
 xen/drivers/passthrough/vtd/utils.c          |    2 
 xen/include/acpi/cpufreq/cpufreq.h           |    3 
 xen/include/acpi/cpufreq/processor_perf.h    |   13 
 xen/include/asm-x86/domain.h                 |   14 
 xen/include/asm-x86/hvm/vmx/vmcs.h           |    8 
 xen/include/asm-x86/mm.h                     |    8 
 xen/include/asm-x86/perfc_defn.h             |   15 
 xen/include/public/hvm/hvm_op.h              |   13 
 xen/include/xen/domain.h                     |    3 
 xen/include/xen/sched.h                      |   12 
 92 files changed, 3996 insertions(+), 824 deletions(-)

diff -r 11318234588e -r 08f77df14cba docs/ChangeLog
--- a/docs/ChangeLog    Thu Jun 19 12:48:04 2008 +0900
+++ b/docs/ChangeLog    Wed Jul 02 11:30:37 2008 +0900
@@ -16,6 +16,15 @@ Xen 3.3 release
 Xen 3.3 release
 ---------------
 
+17903: Add greater than 16 xvd device availability
+http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d
+
+The tools can now attach a disk of the form:
+(1<<28) | (device<<8) | partition
+to support many more xvd disks and up to 256 partitions.
+The linux guest frontend has been expanded to support
+this new construct, while legacy guests should just ignore it.
+       
 17538: Add XENPF_set_processor_pminfo
 http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9
 
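As an aside, a minimal C sketch (not part of this changeset) of how a toolstack might compute the extended xvd device number described above: the bit layout (flag bit 28, disk index from bit 8 upward, partition in the low 8 bits) follows the ChangeLog text, while the helper name and the sample values are hypothetical.

    #include <stdio.h>

    /* Hypothetical helper: build the extended virtual-device value
     * (1 << 28) | (device << 8) | partition, which lifts the classic
     * 16-disk/16-partition xvd limit. */
    static unsigned long xvd_ext_devnum(unsigned int device, unsigned int partition)
    {
        return (1UL << 28) | ((unsigned long)device << 8) | (partition & 0xff);
    }

    int main(void)
    {
        /* e.g. the 27th xvd disk (index 26), partition 3 */
        printf("virtual-device = %lu\n", xvd_ext_devnum(26, 3));
        return 0;
    }

A frontend that understands the flag would presumably recover the disk index as (num >> 8) & 0xfffff and the partition as num & 0xff; as the entry notes, legacy guests simply ignore such devices.
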
diff -r 11318234588e -r 08f77df14cba extras/mini-os/arch/x86/mm.c
--- a/extras/mini-os/arch/x86/mm.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/arch/x86/mm.c      Wed Jul 02 11:30:37 2008 +0900
@@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un
 
 static void clear_bootstrap(void)
 {
-    xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) };
-    int n = sizeof(mfns)/sizeof(*mfns);
     pte_t nullpte = { };
 
     /* Use first page as the CoW zero page */
     memset(&_text, 0, PAGE_SIZE);
-    mfn_zero = pfn_to_mfn((unsigned long) &_text);
-    if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG))
-       printk("Unable to unmap first page\n");
-
-    if (free_physical_pages(mfns, n) != n)
-       printk("Unable to free bootstrap pages\n");
+    mfn_zero = virt_to_mfn((unsigned long) &_text);
+    if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG))
+       printk("Unable to unmap NULL page\n");
 }
 
 void arch_init_p2m(unsigned long max_pfn)
diff -r 11318234588e -r 08f77df14cba extras/mini-os/blkfront.c
--- a/extras/mini-os/blkfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/blkfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char 
 
     dev->events = NULL;
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fbfront.c
--- a/extras/mini-os/fbfront.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fbfront.c  Wed Jul 02 11:30:37 2008 +0900
@@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char 
     s->in_cons = s->in_prod = 0;
     s->out_cons = s->out_prod = 0;
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
@@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n
         s->pd[i] = 0;
 
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fs-front.c
--- a/extras/mini-os/fs-front.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fs-front.c Wed Jul 02 11:30:37 2008 +0900
@@ -136,8 +136,8 @@ again:
 again:    
     old_id = freelist[0];
     /* Note: temporal inconsistency, since freelist[0] can be changed by someone
-     * else, but we are a sole owner of freelist[id], it's OK. */
-    freelist[id] = old_id;
+     * else, but we are a sole owner of freelist[id + 1], it's OK. */
+    freelist[id + 1] = old_id;
     new_id = id;
     if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
     {
@@ -154,7 +154,7 @@ static inline unsigned short get_id_from
 
 again:    
     old_id = freelist[0];
-    new_id = freelist[old_id];
+    new_id = freelist[old_id + 1];
     if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
     {
         printk("Cmpxchg on freelist remove failed.\n");
@@ -785,8 +785,8 @@ static void alloc_request_table(struct f
     printk("Allocating request array for import %d, nr_entries = %d.\n",
             import->import_id, import->nr_entries);
     requests = xmalloc_array(struct fs_request, import->nr_entries);
-    import->freelist = xmalloc_array(unsigned short, import->nr_entries);
-    memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries);
+    import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1);
+    memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1));
     for(i=0; i<import->nr_entries; i++)
     {
        /* TODO: that's a lot of memory */
diff -r 11318234588e -r 08f77df14cba extras/mini-os/lib/sys.c
--- a/extras/mini-os/lib/sys.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/lib/sys.c  Wed Jul 02 11:30:37 2008 +0900
@@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set 
 #ifdef LIBC_VERBOSE
     static int nb;
     static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE];
-    static s64_t lastshown;
+    static s_time_t lastshown;
 
     nb++;
 #endif
diff -r 11318234588e -r 08f77df14cba extras/mini-os/netfront.c
--- a/extras/mini-os/netfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/netfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -38,7 +38,7 @@ struct netfront_dev {
 struct netfront_dev {
     domid_t dom;
 
-    unsigned short tx_freelist[NET_TX_RING_SIZE];
+    unsigned short tx_freelist[NET_TX_RING_SIZE + 1];
     struct semaphore tx_sem;
 
     struct net_buffer rx_buffers[NET_RX_RING_SIZE];
@@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev
 
 static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist)
 {
-    freelist[id] = freelist[0];
+    freelist[id + 1] = freelist[0];
     freelist[0]  = id;
 }
 
 static inline unsigned short get_id_from_freelist(unsigned short* freelist)
 {
     unsigned int id = freelist[0];
-    freelist[0] = freelist[id];
+    freelist[0] = freelist[id + 1];
     return id;
 }
 
diff -r 11318234588e -r 08f77df14cba stubdom/grub.patches/99minios
--- a/stubdom/grub.patches/99minios     Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub.patches/99minios     Wed Jul 02 11:30:37 2008 +0900
@@ -832,7 +832,18 @@ Index: grub/stage2/fsys_reiserfs.c
 Index: grub/stage2/fsys_reiserfs.c
 ===================================================================
 --- grub.orig/stage2/fsys_reiserfs.c   2008-06-16 15:18:03.410933000 +0100
-+++ grub/stage2/fsys_reiserfs.c        2008-06-16 15:18:14.786009000 +0100
++++ grub/stage2/fsys_reiserfs.c        2008-06-20 18:33:52.002100000 +0100
+@@ -224,8 +224,8 @@
+ 
+ struct disk_child
+ {
+-  unsigned long       dc_block_number;              /* Disk child's block number. */
+-  unsigned short      dc_size;                            /* Disk child's used space.   */
++  __u32       dc_block_number;              /* Disk child's block number. */
++  __u16      dc_size;                     /* Disk child's used space.   */
+ };
+ 
+ #define DC_SIZE (sizeof (struct disk_child))
 @@ -369,7 +369,14 @@
  static __inline__ unsigned long
  log2 (unsigned long word)
diff -r 11318234588e -r 08f77df14cba stubdom/grub/Makefile
--- a/stubdom/grub/Makefile     Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub/Makefile     Wed Jul 02 11:30:37 2008 +0900
@@ -5,7 +5,7 @@ vpath %.c ../grub-cvs
 
 BOOT=boot-$(XEN_TARGET_ARCH).o
 
-DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I.
+DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I.
 DEF_CPPFLAGS += -I../grub-cvs/stage1
 DEF_CPPFLAGS += -I../grub-cvs/stage2
 DEF_CPPFLAGS += -I../grub-cvs/netboot
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/Makefile     Wed Jul 02 11:30:37 2008 +0900
@@ -17,8 +17,16 @@ CFLAGS   += -Wp,-MD,.$(@F).d
 CFLAGS   += -Wp,-MD,.$(@F).d
 DEPS      = .*.d
 
+ifeq ($(shell . ./check_gcrypt),"yes")
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB := -lgcrypt
+else
+CRYPT_LIB := -lcrypto
+$(warning *** libgcrypt not installed: falling back to libcrypto ***)
+endif
+
 LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
-LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz
+LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz
 
 BLK-OBJS-y  := block-aio.o
 BLK-OBJS-y  += block-sync.o
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/blktapctrl.c Wed Jul 02 11:30:37 2008 +0900
@@ -127,7 +127,7 @@ static int get_new_dev(int *major, int *
        char *devname;
        
        tr.domid = blkif->domid;
-        tr.busid = (unsigned short)blkif->be_id;
+        tr.busid = blkif->be_id;
        ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr );
        
        if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow.c Wed Jul 02 11:30:37 2008 +0900
@@ -33,7 +33,6 @@
 #include <zlib.h>
 #include <inttypes.h>
 #include <libaio.h>
-#include <openssl/md5.h>
 #include "bswap.h"
 #include "aes.h"
 #include "tapdisk.h"
@@ -146,6 +145,35 @@ struct tdqcow_state {
 
 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
 
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+static uint32_t gen_cksum(char *ptr, int len)
+{
+       int i;
+       uint32_t md[4];
+
+       /* Convert L1 table to big endian */
+       for(i = 0; i < len / sizeof(uint64_t); i++) {
+               cpu_to_be64s(&((uint64_t*) ptr)[i]);
+       }
+
+       /* Generate checksum */
+       gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
+
+       /* Convert L1 table back to native endianess */
+       for(i = 0; i < len / sizeof(uint64_t); i++) {
+               be64_to_cpus(&((uint64_t*) ptr)[i]);
+       }
+
+       return md[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
 static uint32_t gen_cksum(char *ptr, int len)
 {
        int i;
@@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int
        uint32_t ret;
 
        md = malloc(MD5_DIGEST_LENGTH);
-
        if(!md) return 0;
-       
+
        /* Convert L1 table to big endian */
        for(i = 0; i < len / sizeof(uint64_t); i++) {
                cpu_to_be64s(&((uint64_t*) ptr)[i]);
@@ -175,6 +202,8 @@ static uint32_t gen_cksum(char *ptr, int
        free(md);
        return ret;
 }
+
+#endif
 
 static int get_filesize(char *filename, uint64_t *size, struct stat *st)
 {
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow2.c
--- a/tools/blktap/drivers/block-qcow2.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow2.c        Wed Jul 02 11:30:37 2008 +0900
@@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of
  */
 static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count)
 {
-       int ret;
-       
-       ret = lseek(fd, offset, SEEK_SET);
-       if (ret != offset) {
+       if (lseek(fd, offset, SEEK_SET) == -1) {
                DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset);
                return -1;
        }
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/check_gcrypt
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/check_gcrypt Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) { return 0; }
+EOF
+
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+  echo "yes"
+else
+  echo "no"
+fi
+
+rm -f .gcrypt*
diff -r 11318234588e -r 08f77df14cba tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/lib/blktaplib.h      Wed Jul 02 11:30:37 2008 +0900
@@ -161,7 +161,7 @@ typedef struct tapdev_info {
 
 typedef struct domid_translate {
        unsigned short domid;
-       unsigned short busid;
+       uint32_t busid;
 } domid_translate_t ;
 
 typedef struct image {
diff -r 11318234588e -r 08f77df14cba tools/debugger/xenitp/xenitp.c
--- a/tools/debugger/xenitp/xenitp.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/debugger/xenitp/xenitp.c    Wed Jul 02 11:30:37 2008 +0900
@@ -57,6 +57,16 @@ static int cur_vcpu;
 #define CFM_SOF_MASK            0x3f
 
 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr);
+
+/* wrapper for vcpu_gest_context_any_t */
+static int xc_ia64_vcpu_getcontext(int xc_handle,
+                                   uint32_t domid,
+                                   uint32_t vcpu,
+                                   vcpu_guest_context_t *ctxt)
+{
+    return xc_vcpu_getcontext(xc_handle, domid, vcpu,
+                              (vcpu_guest_context_any_t *)ctxt);
+}
 
 static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx)
 {
@@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co
         fflush (stdout);
         nanosleep (&ts, NULL);
     }
-    return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
+    return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
 }
 
 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr)
@@ -945,13 +955,13 @@ char *parse_arg (char **buf)
     return res;
 }
 
-vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS];
+vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS];
 
 int vcpu_setcontext (int vcpu)
 {
     int ret;
 
-    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]);
+    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]);
     if (ret < 0)
         perror ("xc_vcpu_setcontext");
 
@@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch
     int flag_ambiguous;
 
     cur_vcpu = vcpu;
-    cur_ctx = &vcpu_ctx[vcpu];
+    cur_ctx = &vcpu_ctx_any[vcpu].c;
 
     /* Handle repeat last-command.  */
     if (*line == 0) {
@@ -1575,7 +1585,7 @@ void xenitp (int vcpu)
     int ret;
     struct sigaction sa;
 
-    cur_ctx = &vcpu_ctx[vcpu];
+    cur_ctx = &vcpu_ctx_any[vcpu].c;
 
     xc_handle = xc_interface_open (); /* for accessing control interface */
 
@@ -1588,9 +1598,9 @@ void xenitp (int vcpu)
         exit (-1);
     }
 
-    ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
+    ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
     if (ret < 0) {
-        perror ("xc_vcpu_getcontext");
+        perror ("xc_ia64_vcpu_getcontext");
         exit (-1);
     }
 
diff -r 11318234588e -r 08f77df14cba tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/examples/xend-config.sxp    Wed Jul 02 11:30:37 2008 +0900
@@ -242,3 +242,6 @@
 
 # Script to run when the label of a resource has changed.
 #(resource-label-change-script '')
+
+# Rotation count of qemu-dm log file.
+#(qemu-dm-logrotate-count 10)
diff -r 11318234588e -r 08f77df14cba tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/hvmloader/hvmloader.c      Wed Jul 02 11:30:37 2008 +0900
@@ -206,10 +206,12 @@ static void pci_setup(void)
             pci_writew(devfn, 0x3d, 0x0001);
             break;
         case 0x0101:
-            /* PIIX3 IDE */
-            ASSERT((vendor_id == 0x8086) && (device_id == 0x7010));
-            pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
-            pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+            if ( vendor_id == 0x8086 )
+            {
+                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
+                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
+                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+            }
             break;
         }
 
diff -r 11318234588e -r 08f77df14cba tools/firmware/rombios/rombios.c
--- a/tools/firmware/rombios/rombios.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/rombios/rombios.c  Wed Jul 02 11:30:37 2008 +0900
@@ -9783,6 +9783,27 @@ smbios_init:
 
 #endif
 
+#if BX_TCGBIOS
+; The section between the POST entry and the NMI entry is filling up
+; and causes crashes if this code was directly there
+tcpa_post_part1:
+  call _tcpa_acpi_init
+
+  push dword #0
+  call _tcpa_initialize_tpm
+  add sp, #4
+
+  call _tcpa_do_measure_POSTs
+  call _tcpa_wake_event     /* specs: 3.2.3.7 */
+  ret
+
+tcpa_post_part2:
+  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
+  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
+  /* we do not call int 19h handler but keep following eventlog */
+  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
+  ret
+#endif
 
 
 ;; for 'C' strings and other data, insert them here with
@@ -10003,14 +10024,7 @@ post_default_ints:
   mov  0x0410, ax
 
 #if BX_TCGBIOS
-  call _tcpa_acpi_init
-
-  push dword #0
-  call _tcpa_initialize_tpm
-  add sp, #4
-
-  call _tcpa_do_measure_POSTs
-  call _tcpa_wake_event     /* specs: 3.2.3.7 */
+  call tcpa_post_part1
 #endif
 
   ;; Parallel setup
@@ -10138,10 +10152,7 @@ post_default_ints:
   call _interactive_bootkey
 
 #if BX_TCGBIOS
-  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
-  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
-  /* we do not call int 19h handler but keep following eventlog */
-  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
+  call tcpa_post_part2
 #endif
 
   ;; Start the boot sequence.   See the comments in int19_relocated 
diff -r 11318234588e -r 08f77df14cba tools/ioemu/hw/xen_console.c
--- a/tools/ioemu/hw/xen_console.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/hw/xen_console.c      Wed Jul 02 11:30:37 2008 +0900
@@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons
 
 static int domain_create_ring(struct domain *dom)
 {
-       int err, remote_port, ring_ref, rc;
+       int err, remote_port, ring_ref, limit, rc;
 
        err = xs_gather(dom->xsh, dom->serialpath,
                        "ring-ref", "%u", &ring_ref,
                        "port", "%i", &remote_port,
+                       "limit", "%i", &limit,
                        NULL);
        if (err) {
                err = xs_gather(dom->xsh, dom->conspath,
                                "ring-ref", "%u", &ring_ref,
                                "port", "%i", &remote_port,
+                               "limit", "%i", &limit,
                                NULL);
                if (err) {
                        fprintf(stderr, "Console: failed to find ring-ref/port 
yet\n");
@@ -178,7 +180,9 @@ static int domain_create_ring(struct dom
                dom->use_consolepath = 1;
        } else
                dom->use_consolepath = 0;
-       fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref, 
remote_port);
+       dom->buffer.max_capacity = limit;
+       fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n", 
+               ring_ref, remote_port, limit);
 
        if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port))
                goto out;
diff -r 11318234588e -r 08f77df14cba tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/target-i386-dm/exec-dm.c      Wed Jul 02 11:30:37 2008 +0900
@@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void
 }
 #endif
 
-void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, 
-                            int len, int is_write)
-{
+void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf, 
+                            int _len, int is_write)
+{
+    target_phys_addr_t addr = _addr;
+    int len = _len;
     int l, io_index;
     uint8_t *ptr;
     uint32_t val;
@@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_
             } else if ((ptr = phys_ram_addr(addr)) != NULL) {
                 /* Writing to RAM */
                 memcpy_words(ptr, buf, l);
+#ifndef CONFIG_STUBDOM
                 if (logdirty_bitmap != NULL) {
                     /* Record that we have dirtied this frame */
                     unsigned long pfn = addr >> TARGET_PAGE_BITS;
@@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_
                             |= 1UL << pfn % HOST_LONG_BITS;
                     }
                 }
+#endif
 #ifdef __ia64__
                 sync_icache(ptr, l);
 #endif 
@@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_
         addr += l;
     }
 
+#ifdef CONFIG_STUBDOM
+    if (logdirty_bitmap != NULL)
+        xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS,
+                (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS
+                    - _addr >> TARGET_PAGE_BITS);
+#endif
+
     mapcache_unlock();
 }
 #endif
diff -r 11318234588e -r 08f77df14cba tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/xenstore.c    Wed Jul 02 11:30:37 2008 +0900
@@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv
                    /* autoguess qcow vs qcow2 */
                } else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) {
                    format = &bdrv_raw;
-               } else if (!strcmp(drv,"phy")) {
-                   format = &bdrv_raw;
                } else {
                    format = bdrv_find_format(drv);
                    if (!format) {
@@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi
             /* No key yet: wait for the next watch */
             return;
 
+#ifdef CONFIG_STUBDOM
+        /* We pass the writes to hypervisor */
+        seg = (void*)1;
+#else
         strncpy(key_terminated, key_ascii, 16);
         free(key_ascii);
         key = (key_t) strtoull(key_terminated, NULL, 16);
@@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi
         fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__,
                 (unsigned long long)key, logdirty_bitmap_size);
 
-#ifdef CONFIG_STUBDOM
-        /* XXX we just can't use shm. */
-        fprintf(logfile, "Log dirty is not implemented in stub domains!\n");
-        return;
-#else
         shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR);
         if (shmid == -1) {
             fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx "
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_hvm_build.c
--- a/tools/libxc/ia64/xc_ia64_hvm_build.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_hvm_build.c      Wed Jul 02 11:30:37 2008 +0900
@@ -1052,7 +1052,8 @@ int
 int
 xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name)
 {
-    vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
+    vcpu_guest_context_any_t st_ctxt_any;
+    vcpu_guest_context_t *ctxt = &st_ctxt_any.c;
     char *image = NULL;
     unsigned long image_size;
     unsigned long nr_pages;
@@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom
 
     free(image);
 
-    memset(ctxt, 0, sizeof(*ctxt));
+    memset(&st_ctxt_any, 0, sizeof(st_ctxt_any));
     ctxt->regs.ip = 0x80000000ffffffb0UL;
     ctxt->regs.ar.fpsr = xc_ia64_fpsr_default();
     ctxt->regs.cr.itir = 14 << 2;
     ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN;
     ctxt->regs.cr.dcr = 0;
     ctxt->regs.cr.pta = 15 << 2;
-    return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt);
+    return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any);
 
 error_out:
     free(image);
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_restore.c
--- a/tools/libxc/ia64/xc_ia64_linux_restore.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_restore.c  Wed Jul 02 11:30:37 2008 +0900
@@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han
 
 static int
 xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
-                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
+                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
     if (read_exact(io_fd, ctxt, sizeof(*ctxt))) {
         ERROR("Error when reading ctxt");
         return -1;
@@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle,
 
     /* Initialize and set registers.  */
     ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online;
-    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) {
+    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) {
         ERROR("Couldn't set vcpu context");
         return -1;
     }
 
     /* Just a check.  */
     ctxt->flags = 0;
-    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
         ERROR("Could not get vcpu context");
         return -1;
     }
@@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand
     int rc = -1;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-    
-    if (lock_pages(&ctxt, sizeof(ctxt))) {
+    vcpu_guest_context_any_t ctxt_any;
+    vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
+    if (lock_pages(&ctxt_any, sizeof(ctxt_any))) {
         /* needed for build domctl, but might as well do early */
         ERROR("Unable to lock_pages ctxt");
         return -1;
     }
 
-    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt))
+    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any))
         goto out;
 
     /* Then get privreg page.  */
-    if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) {
+    if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) {
         ERROR("Could not read vcpu privregs");
         goto out;
     }
@@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle, 
     /* vcpu context */
     for (i = 0; i <= info.max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         /* system context of vcpu is recieved as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Jul 02 11:30:37 2008 +0900
@@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han
 
 static int
 xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
-                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
-    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
+    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
         ERROR("Could not get vcpu context");
         return -1;
     }
@@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i
     /* vcpu context */
     for (i = 0; i <= info->max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
+        vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
         char *mem;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                   PROT_READ|PROT_WRITE, ctxt.privregs_pfn);
+                                   PROT_READ|PROT_WRITE, ctxt->privregs_pfn);
         if (mem == NULL) {
             ERROR("cannot map privreg page");
             goto out;
@@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle, 
     /* vcpu context */
     for (i = 0; i <= info->max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         /* system context of vcpu is sent as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core.c
--- a/tools/libxc/xc_core.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core.c     Wed Jul 02 11:30:37 2008 +0900
@@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h
 
     int nr_vcpus = 0;
     char *dump_mem, *dump_mem_start = NULL;
-    vcpu_guest_context_t  ctxt[MAX_VIRT_CPUS];
+    vcpu_guest_context_any_t  ctxt[MAX_VIRT_CPUS];
     struct xc_core_arch_context arch_ctxt;
     char dummy[PAGE_SIZE];
     int dummy_len;
@@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h
         PERROR("Could not get section header for .xen_prstatus");
         goto out;
     }
-    filesz = sizeof(ctxt[0]) * nr_vcpus;
+    filesz = sizeof(ctxt[0].c) * nr_vcpus;
     sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS,
                            SHT_PROGBITS, offset, filesz,
-                           __alignof__(ctxt[0]), sizeof(ctxt[0]));
+                           __alignof__(ctxt[0].c), sizeof(ctxt[0].c));
     if ( sts != 0 )
         goto out;
     offset += filesz;
@@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         goto out;
 
     /* prstatus: .xen_prstatus */
-    sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus);
+    sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus);
     if ( sts != 0 )
         goto out;
 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.c
--- a/tools/libxc/xc_core_ia64.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.c        Wed Jul 02 11:30:37 2008 +0900
@@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core
 
 int
 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
-                         vcpu_guest_context_t* ctxt,
+                         vcpu_guest_context_any_t* ctxt_any,
                          int xc_handle, uint32_t domid)
 {
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
     mapped_regs_t* mapped_regs;
 
     if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM )
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.h
--- a/tools/libxc/xc_core_ia64.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.h        Wed Jul 02 11:30:37 2008 +0900
@@ -40,7 +40,7 @@ xc_core_arch_context_free(struct xc_core
 xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt);
 int
 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
-                         vcpu_guest_context_t* ctxt,
+                         vcpu_guest_context_any_t* ctxt,
                          int xc_handle, uint32_t domid);
 int
 xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt, 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain.c   Wed Jul 02 11:30:37 2008 +0900
@@ -298,30 +298,21 @@ int xc_vcpu_getcontext(int xc_handle,
 int xc_vcpu_getcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt)
-{
-    int rc;
-    DECLARE_DOMCTL;
-    size_t sz = sizeof(vcpu_guest_context_either_t);
+                       vcpu_guest_context_any_t *ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+    size_t sz = sizeof(vcpu_guest_context_any_t);
 
     domctl.cmd = XEN_DOMCTL_getvcpucontext;
     domctl.domain = (domid_t)domid;
     domctl.u.vcpucontext.vcpu   = (uint16_t)vcpu;
-    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
-    /*
-     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
-     * larger of the two if possible, otherwise fall back to native size.
-     */
+    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+    
     if ( (rc = lock_pages(ctxt, sz)) != 0 )
-    {
-        sz = sizeof(*ctxt);
-        if ( (rc = lock_pages(ctxt, sz)) != 0 )
-            return rc;
-    }
-
+        return rc;
     rc = do_domctl(xc_handle, &domctl);
-
     unlock_pages(ctxt, sz);
 
     return rc;
@@ -626,32 +617,28 @@ int xc_vcpu_setcontext(int xc_handle,
 int xc_vcpu_setcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt)
-{
-    DECLARE_DOMCTL;
-    int rc;
-    size_t sz = sizeof(vcpu_guest_context_either_t);
+                       vcpu_guest_context_any_t *ctxt)
+{
+    DECLARE_DOMCTL;
+    int rc;
+    size_t sz = sizeof(vcpu_guest_context_any_t);
+
+    if (ctxt == NULL)
+    {
+        errno = EINVAL;
+        return -1;
+    }
 
     domctl.cmd = XEN_DOMCTL_setvcpucontext;
     domctl.domain = domid;
     domctl.u.vcpucontext.vcpu = vcpu;
-    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
-    /*
-     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
-     * larger of the two if possible, otherwise fall back to native size.
-     */
-    if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 )
-    {
-        sz = sizeof(*ctxt);
-        if ( (rc = lock_pages(ctxt, sz)) != 0 )
-            return rc;
-    }
-
+    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+    if ( (rc = lock_pages(ctxt, sz)) != 0 )
+        return rc;
     rc = do_domctl(xc_handle, &domctl);
-
-    if ( ctxt != NULL )
-        unlock_pages(ctxt, sz);
+    
+    unlock_pages(ctxt, sz);
 
     return rc;
 }
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_restore.c   Wed Jul 02 11:30:37 2008 +0900
@@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list(
     int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
 {
     xen_pfn_t *p2m_frame_list;
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     xen_pfn_t p2m_fl_zero;
 
     /* Read first entry of P2M list, or extended-info signature (~0UL). */
@@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int
     /* The new domain's shared-info frame number. */
     unsigned long shared_info_frame;
     unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
-    shared_info_either_t *old_shared_info = 
-        (shared_info_either_t *)shared_info_page;
-    shared_info_either_t *new_shared_info;
+    shared_info_any_t *old_shared_info = 
+        (shared_info_any_t *)shared_info_page;
+    shared_info_any_t *new_shared_info;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
 
     /* A table containing the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
@@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int
     xen_pfn_t *p2m_frame_list = NULL;
     
     /* A temporary mapping of the guest's start_info page. */
-    start_info_either_t *start_info;
+    start_info_any_t *start_info;
 
     /* Our mapping of the current region (batch) */
     char *region_base;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_save.c      Wed Jul 02 11:30:37 2008 +0900
@@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe
 ** it to update the MFN to a reasonable value.
 */
 static void *map_frame_list_list(int xc_handle, uint32_t dom,
-                                 shared_info_either_t *shinfo)
+                                 shared_info_any_t *shinfo)
 {
     int count = 100;
     void *p;
@@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table
                                          int io_fd, 
                                          uint32_t dom,
                                          unsigned long p2m_size,
-                                         shared_info_either_t *live_shinfo)
-{
-    vcpu_guest_context_either_t ctxt;
+                                         shared_info_any_t *live_shinfo)
+{
+    vcpu_guest_context_any_t ctxt;
 
     /* Double and single indirect references to the live P2M table */
     void *live_p2m_frame_list_list = NULL;
@@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table
         p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
     }
 
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
     {
         ERROR("Could not get vcpu context");
         goto out;
@@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io
     unsigned long shared_info_frame;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
 
     /* A table containing the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
@@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io
     char page[PAGE_SIZE];
 
     /* Live mapping of shared info structure */
-    shared_info_either_t *live_shinfo = NULL;
+    shared_info_any_t *live_shinfo = NULL;
 
     /* base of the region in which domain memory is mapped */
     unsigned char *region_base = NULL;
@@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io
         }
     }
 
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
     {
         ERROR("Could not get vcpu context");
         goto out;
@@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io
         if ( !(vcpumap & (1ULL << i)) )
             continue;
 
-        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
+        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
         {
             ERROR("No context for VCPU%d", i);
             goto out;
@@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io
      * Reset the MFN to be a known-invalid value. See map_frame_list_list().
      */
     memcpy(page, live_shinfo, PAGE_SIZE);
-    SET_FIELD(((shared_info_either_t *)page), 
+    SET_FIELD(((shared_info_any_t *)page), 
               arch.pfn_to_mfn_frame_list_list, 0);
     if ( write_exact(io_fd, page, PAGE_SIZE) )
     {
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_misc.c     Wed Jul 02 11:30:37 2008 +0900
@@ -253,6 +253,34 @@ int xc_hvm_track_dirty_vram(
     arg.first_pfn = first_pfn;
     arg.nr        = nr;
     set_xen_guest_handle(arg.dirty_bitmap, (uint8_t *)dirty_bitmap);
+
+    if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
+    {
+        PERROR("Could not lock memory");
+        return rc;
+    }
+
+    rc = do_xen_hypercall(xc_handle, &hypercall);
+
+    unlock_pages(&arg, sizeof(arg));
+
+    return rc;
+}
+
+int xc_hvm_modified_memory(
+    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr)
+{
+    DECLARE_HYPERCALL;
+    struct xen_hvm_modified_memory arg;
+    int rc;
+
+    hypercall.op     = __HYPERVISOR_hvm_op;
+    hypercall.arg[0] = HVMOP_modified_memory;
+    hypercall.arg[1] = (unsigned long)&arg;
+
+    arg.domid     = dom;
+    arg.first_pfn = first_pfn;
+    arg.nr        = nr;
 
     if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
     {
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_pagetab.c
--- a/tools/libxc/xc_pagetab.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_pagetab.c  Wed Jul 02 11:30:37 2008 +0900
@@ -48,7 +48,7 @@ unsigned long xc_translate_foreign_addre
 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
                                            int vcpu, unsigned long long virt )
 {
-    vcpu_guest_context_t ctx;
+    vcpu_guest_context_any_t ctx;
     unsigned long long cr3;
     void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL;
     unsigned long long pde, pte, pdpe, pmle;
@@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre
         DPRINTF("failed to retreive vcpu context\n");
         goto out;
     }
-    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT;
+    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT;
 
     /* Page Map Level 4 */
 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_private.h
--- a/tools/libxc/xc_private.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_private.h  Wed Jul 02 11:30:37 2008 +0900
@@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle,
                           privcmd_mmap_entry_t *entries, int nr);
 
 void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
-                         vcpu_guest_context_t *ctxt);
+                         vcpu_guest_context_any_t *ctxt);
 int xc_waitdomain_core(int xc_handle, int domain, int *status,
-    int options, vcpu_guest_context_t *ctxt);
+    int options, vcpu_guest_context_any_t *ctxt);
 
 void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits);
 void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace.c
--- a/tools/libxc/xc_ptrace.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace.c   Wed Jul 02 11:30:37 2008 +0900
@@ -40,9 +40,9 @@ static int current_isfile;
 static int current_isfile;
 static int current_is_hvm;
 
-static uint64_t                 online_cpumap;
-static uint64_t                 regs_valid;
-static vcpu_guest_context_t     ctxt[MAX_VIRT_CPUS];
+static uint64_t                         online_cpumap;
+static uint64_t                         regs_valid;
+static vcpu_guest_context_any_t      ctxt[MAX_VIRT_CPUS];
 
 extern int ffsll(long long int);
 #define FOREACH_CPU(cpumap, i)  for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) )
@@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler
 }
 
 static inline int
-paging_enabled(vcpu_guest_context_t *v)
-{
-    unsigned long cr0 = v->ctrlreg[0];
+paging_enabled(vcpu_guest_context_any_t *v)
+{
+    unsigned long cr0 = v->c.ctrlreg[0];
     return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
 }
 
@@ -174,7 +174,7 @@ map_domain_va_32(
 
     l2 = xc_map_foreign_range(
          xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-         xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+         xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l2 == NULL )
         return NULL;
 
@@ -216,7 +216,7 @@ map_domain_va_pae(
 
     l3 = xc_map_foreign_range(
         xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l3 == NULL )
         return NULL;
 
@@ -264,12 +264,12 @@ map_domain_va_64(
     uint64_t *l4, *l3, *l2, *l1;
     static void *v[MAX_VIRT_CPUS];
 
-    if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
+    if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
         return map_domain_va_32(xc_handle, cpu, guest_va, perm);
 
     l4 = xc_map_foreign_range(
         xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l4 == NULL )
         return NULL;
 
@@ -494,26 +494,26 @@ xc_ptrace(
     case PTRACE_GETREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
             goto out_error;
-        SET_PT_REGS(pt, ctxt[cpu].user_regs);
+        SET_PT_REGS(pt, ctxt[cpu].c.user_regs);
         memcpy(data, &pt, sizeof(struct gdb_regs));
         break;
 
     case PTRACE_GETFPREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 
                 goto out_error;
-        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t));
+        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t));
         break;
 
     case PTRACE_GETFPXREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
                 goto out_error;
-        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt));
+        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt));
         break;
 
     case PTRACE_SETREGS:
         if (current_isfile)
                 goto out_unsupported; /* XXX not yet supported */
-        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs);
+        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs);
         if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
                                 &ctxt[cpu])))
             goto out_error_domctl;
@@ -525,7 +525,7 @@ xc_ptrace(
         /*  XXX we can still have problems if the user switches threads
          *  during single-stepping - but that just seems retarded
          */
-        ctxt[cpu].user_regs.eflags |= PSL_T;
+        ctxt[cpu].c.user_regs.eflags |= PSL_T;
         if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
                                 &ctxt[cpu])))
             goto out_error_domctl;
@@ -542,9 +542,9 @@ xc_ptrace(
                 if (fetch_regs(xc_handle, cpu, NULL))
                     goto out_error;
                 /* Clear trace flag */
-                if ( ctxt[cpu].user_regs.eflags & PSL_T )
+                if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
                 {
-                    ctxt[cpu].user_regs.eflags &= ~PSL_T;
+                    ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
                     if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
                                                 cpu, &ctxt[cpu])))
                         goto out_error_domctl;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace_core.c
--- a/tools/libxc/xc_ptrace_core.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace_core.c      Wed Jul 02 11:30:37 2008 +0900
@@ -641,24 +641,24 @@ static const struct xc_core_format_type*
 
 void *
 map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
-                   vcpu_guest_context_t *ctxt)
+                   vcpu_guest_context_any_t *ctxt)
 {
     if (current_format_type == NULL)
         return NULL;
     return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va,
-                                                     ctxt);
+                                                     &ctxt->c);
 }
 
 int
 xc_waitdomain_core(int xc_handle, int domfd, int *status, int options,
-                   vcpu_guest_context_t *ctxt)
+                   vcpu_guest_context_any_t *ctxt)
 {
     int ret;
     int i;
 
     for (i = 0; i < NR_FORMAT_TYPE; i++) {
         ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status,
-                                               options, ctxt);
+                                               options, &ctxt->c);
         if (ret == 0) {
             current_format_type = &format_type[i];
             break;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_resume.c   Wed Jul 02 11:30:37 2008 +0900
@@ -13,7 +13,7 @@
 
 static int modify_returncode(int xc_handle, uint32_t domid)
 {
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     xc_dominfo_t info;
     xen_capabilities_info_t caps;
     int rc;
@@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand
         return -1;
     }
 
-    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 )
         return rc;
 
     if ( !info.hvm )
@@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand
     else
         ctxt.x32.user_regs.eax = 1;
 
-    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 )
         return rc;
 
     return 0;
@@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h
     int i, rc = -1;
 #if defined(__i386__) || defined(__x86_64__)
     unsigned long mfn, p2m_size = 0;
-    vcpu_guest_context_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     start_info_t *start_info;
     shared_info_t *shinfo = NULL;
     xen_pfn_t *p2m_frame_list_list = NULL;
@@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h
         goto out;
     }
 
-    mfn = ctxt.user_regs.edx;
+    mfn = ctxt.c.user_regs.edx;
 
     start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
                                       PROT_READ | PROT_WRITE, mfn);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xenctrl.h     Wed Jul 02 11:30:37 2008 +0900
@@ -30,6 +30,11 @@
 #include <xen/xsm/acm.h>
 #include <xen/xsm/acm_ops.h>
 #include <xen/xsm/flask_op.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+#endif
 
 #ifdef __ia64__
 #define XC_PAGE_SHIFT           14
@@ -162,6 +167,35 @@ typedef struct xc_dominfo {
 } xc_dominfo_t;
 
 typedef xen_domctl_getdomaininfo_t xc_domaininfo_t;
+
+typedef union 
+{
+#if defined(__i386__) || defined(__x86_64__)
+    vcpu_guest_context_x86_64_t x64;
+    vcpu_guest_context_x86_32_t x32;   
+#endif
+    vcpu_guest_context_t c;
+} vcpu_guest_context_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+    shared_info_x86_64_t x64;
+    shared_info_x86_32_t x32;
+#endif
+    shared_info_t s;
+} shared_info_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+    start_info_x86_64_t x64;
+    start_info_x86_32_t x32;
+#endif
+    start_info_t s;
+} start_info_any_t;
+
+
 int xc_domain_create(int xc_handle,
                      uint32_t ssidref,
                      xen_domain_handle_t handle,
@@ -307,7 +341,7 @@ int xc_vcpu_setcontext(int xc_handle,
 int xc_vcpu_setcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt);
+                       vcpu_guest_context_any_t *ctxt);
 /**
  * This function will return information about one or more domains, using a
  * single hypercall.  The domain information will be stored into the supplied
@@ -368,7 +402,7 @@ int xc_vcpu_getcontext(int xc_handle,
 int xc_vcpu_getcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt);
+                       vcpu_guest_context_any_t *ctxt);
 
 typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t;
 int xc_vcpu_getinfo(int xc_handle,
@@ -894,6 +928,12 @@ int xc_hvm_track_dirty_vram(
     int xc_handle, domid_t dom,
     uint64_t first_pfn, uint64_t nr,
     unsigned long *bitmap);
+
+/*
+ * Notify that some pages got modified by the Device Model
+ */
+int xc_hvm_modified_memory(
+    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr);
 
 typedef enum {
   XC_ERROR_NONE = 0,
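
The unions above (vcpu_guest_context_any_t, shared_info_any_t, start_info_any_t) were previously private to xg_save_restore.h; exporting them from xenctrl.h lets every libxc caller pass a single object to xc_vcpu_getcontext()/xc_vcpu_setcontext() and then pick the layout that matches the guest width, as the modify_returncode() hunk above does. A minimal sketch of that calling pattern follows; the helper name and the 32-bit-guest flag are illustrative only, and the x32/x64 members exist only on x86 builds:

    /* Sketch only: fetch VCPU 0's context, flag a nonzero return code, and
     * write it back, choosing the union view that matches the guest. */
    #include <xenctrl.h>

    static int set_resume_rc(int xc_handle, uint32_t domid, int guest_is_32bit_hvm)
    {
        vcpu_guest_context_any_t ctxt;

        if (xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) != 0)
            return -1;

        if (guest_is_32bit_hvm)
            ctxt.x32.user_regs.eax = 1;   /* 32-bit guest layout */
        else
            ctxt.c.user_regs.eax = 1;     /* native (tools-width) layout */

        return xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt);
    }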
diff -r 11318234588e -r 08f77df14cba tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xg_save_restore.h     Wed Jul 02 11:30:37 2008 +0900
@@ -112,28 +112,6 @@ static inline int get_platform_info(int 
 #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
 
 
-/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */
-typedef union 
-{
-    vcpu_guest_context_x86_64_t x64;
-    vcpu_guest_context_x86_32_t x32;   
-    vcpu_guest_context_t c;
-} vcpu_guest_context_either_t;
-
-typedef union 
-{
-    shared_info_x86_64_t x64;
-    shared_info_x86_32_t x32;   
-    shared_info_t s;
-} shared_info_either_t;
-
-typedef union 
-{
-    start_info_x86_64_t x64;
-    start_info_x86_32_t x32;   
-    start_info_t s;
-} start_info_either_t;
-
 #define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f))
 
 #define SET_FIELD(_p, _f, _v) do {              \
diff -r 11318234588e -r 08f77df14cba tools/python/xen/util/blkif.py
--- a/tools/python/xen/util/blkif.py    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/util/blkif.py    Wed Jul 02 11:30:37 2008 +0900
@@ -16,8 +16,11 @@ def blkdev_name_to_number(name):
 
     n = expand_dev_name(name)
 
+    devname = 'virtual-device'
+    devnum = None
+
     try:
-        return os.stat(n).st_rdev
+        return (devname, os.stat(n).st_rdev)
     except Exception, ex:
         pass
 
@@ -25,28 +28,30 @@ def blkdev_name_to_number(name):
     if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n):
         major = scsi_major[(ord(n[7:8]) - ord('a')) / 16]
         minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0)
-        return major * 256 + minor
-    if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
+        devnum = major * 256 + minor
+    elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
         major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ]
         minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0)
-        return major * 256 + minor
-
-    if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
+        devnum = major * 256 + minor
+    elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
         ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ]
         major = ide_majors[(ord(n[7:8]) - ord('a')) / 2]
         minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0)
-        return major * 256 + minor
+        devnum = major * 256 + minor
+    elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n):
+        devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0)
+    elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n):
+        devname = 'virtual-device-ext'
+        devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0)
+    elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n):
+        devname = 'virtual-device-ext'
+        devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0)
+    elif re.match( '^(0x)[0-9a-fA-F]+$', name ):
+        devnum = string.atoi(name, 16)
+    elif re.match('^[0-9]+$', name):
+        devnum = string.atoi(name, 10)
 
-    if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n):
-        return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0)
-
-    if re.match( '^(0x)[0-9a-fA-F]+$', name ):
-        return string.atoi(name,16)
-
-    if re.match('^[0-9]+$', name):
-        return string.atoi(name, 10)
-
-    return None
+    return (devname, devnum)
 
 def blkdev_segment(name):
     """Take the given block-device name (e.g. '/dev/sda1', 'hda')
@@ -58,7 +63,7 @@ def blkdev_segment(name):
         type:         'Disk' or identifying name for partition type
     """
     val = None
-    n = blkdev_name_to_number(name)
+    (name, n) = blkdev_name_to_number(name)
     if not n is None:
         val = { 'device'       : n,
                 'start_sector' : long(0),
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Wed Jul 02 11:30:37 2008 +0900
@@ -1123,7 +1123,7 @@ class XendConfig(dict):
             try:
                 devid = int(dev2)
             except ValueError:
-                devid = blkdev_name_to_number(dev2)
+                (xenbus, devid) = blkdev_name_to_number(dev2)
                 if devid == None:
                     log.debug("The device %s is not device name", dev2)
                     return None
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py      Wed Jul 02 11:30:37 2008 +0900
@@ -132,6 +132,9 @@ class XendOptions:
     """Default script to configure a backend network interface"""
     vif_script = osdep.vif_script
 
+    """Default rotation count of qemu-dm log file."""
+    qemu_dm_logrotate_count = 10
+
     def __init__(self):
         self.configure()
 
@@ -350,6 +353,10 @@ class XendOptions:
 
     def get_vnc_x509_verify(self):
         return self.get_config_string('vnc-x509-verify', self.xend_vnc_x509_verify)
+
+    def get_qemu_dm_logrotate_count(self):
+        return self.get_config_int("qemu-dm-logrotate-count",
+                                   self.qemu_dm_logrotate_count)
 
 
 class XendOptionsFile(XendOptions):
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/image.py    Wed Jul 02 11:30:37 2008 +0900
@@ -378,13 +378,23 @@ class ImageHandler:
         # keep track of pid and spawned options to kill it later
 
         self.logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label'])
-        if os.path.exists(self.logfile):
-            if os.path.exists(self.logfile + ".1"):
-                os.unlink(self.logfile + ".1")
-            os.rename(self.logfile, self.logfile + ".1")
+
+        # rotate log
+        logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND
+        logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count()
+        if logrotate_count > 0:
+            logfile_mode |= os.O_TRUNC
+            if os.path.exists("%s.%d" % (self.logfile, logrotate_count)):
+                os.unlink("%s.%d" % (self.logfile, logrotate_count))
+            for n in range(logrotate_count - 1, 0, -1):
+                if os.path.exists("%s.%d" % (self.logfile, n)):
+                    os.rename("%s.%d" % (self.logfile, n),
+                              "%s.%d" % (self.logfile, (n + 1)))
+            if os.path.exists(self.logfile):
+                os.rename(self.logfile, self.logfile + ".1")
 
         null = os.open("/dev/null", os.O_RDONLY)
-        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
+        logfd = os.open(self.logfile, logfile_mode)
         
         sys.stderr.flush()
         pid = os.fork()
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py     Wed Jul 02 11:30:37 2008 +0900
@@ -81,11 +81,11 @@ class BlkifController(DevController):
         if security.on() == xsconstants.XS_POLICY_ACM:
             self.do_access_control(config, uname)
 
-        devid = blkif.blkdev_name_to_number(dev)
+        (device_path, devid) = blkif.blkdev_name_to_number(dev)
         if devid is None:
             raise VmError('Unable to find number for device (%s)' % (dev))
 
-        front = { 'virtual-device' : "%i" % devid,
+        front = { device_path : "%i" % devid,
                   'device-type' : dev_type
                 }
 
@@ -204,5 +204,5 @@ class BlkifController(DevController):
                 dev = devid.split('/')[-1]
                 dev = int(dev)
             except ValueError:
-                dev = blkif.blkdev_name_to_number(dev)
+                (device_path, dev) = blkif.blkdev_name_to_number(dev)
         return dev
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xm/main.py       Wed Jul 02 11:30:37 2008 +0900
@@ -2022,8 +2022,7 @@ def xm_block_list(args):
             map(server.xenapi.VBD.get_runtime_properties, vbd_refs)
         vbd_devs = \
             map(server.xenapi.VBD.get_device, vbd_refs)
-        vbd_devids = \
-            map(blkdev_name_to_number, vbd_devs)
+        vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs]
         devs = map(lambda (devid, prop): [devid, map2sxp(prop)],
                    zip(vbd_devids, vbd_properties))
     else:
diff -r 11318234588e -r 08f77df14cba tools/tests/test_x86_emulator.c
--- a/tools/tests/test_x86_emulator.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/tests/test_x86_emulator.c   Wed Jul 02 11:30:37 2008 +0900
@@ -22,23 +22,22 @@ static int read(
 static int read(
     unsigned int seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    *val = 0;
-    memcpy(val, (void *)offset, bytes);
+    memcpy(p_data, (void *)offset, bytes);
     return X86EMUL_OKAY;
 }
 
 static int write(
     unsigned int seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    memcpy((void *)offset, &val, bytes);
+    memcpy((void *)offset, p_data, bytes);
     return X86EMUL_OKAY;
 }
 
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon-monitor
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon-monitor       Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# xenballoon-monitor - monitor certain stats from xenballoond
+#   (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output)
+#
+# Copyright (C) 2009 Oracle Corporation and/or its affiliates.
+# All rights reserved
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines
+#
+echo "id   mem-kb  tgt-kb  commit   swapin  swapout      pgin     pgout 
active(sec)"
+for i in `xenstore-list /local/domain`; do
+ if [ "$i" -ne 0 ]; then
+ tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0;
+ if xenstore-exists /local/domain/$i/memory/meminfo; then
+  tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \
+   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+  cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \
+   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+ fi
+ if xenstore-exists /local/domain/$i/memory/selftarget; then
+  tgt=`xenstore-read /local/domain/$i/memory/selftarget`
+ fi
+ if xenstore-exists /local/domain/$i/memory/vmstat; then
+  sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \
+       | cut -d" " -f2`
+  sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \
+       | cut -d" " -f2`
+  pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \
+       | cut -d" " -f2`
+  pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \
+       | cut -d" " -f2`
+ fi
+ if xenstore-exists /local/domain/$i/memory/uptime; then
+  up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1`
+  idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2`
+  act=`echo $up - $idle | bc -iq`
+ fi
+ printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout 
$pgin $pgout $act
+ fi
+done
+echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon.conf
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon.conf  Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+## Path: System/xen
+## Description: xen domain start/stop on boot
+## Type: string
+## Default: 
+
+# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists
+# but also that /usr/bin/xenstore-* tools are installed.
+
+## Type: boolean
+## Default: false
+#
+# If XENBALLOON_SELF is true, selfballooning will occur, meaning the
+# balloon driver will grow and shrink according to available memory.
+# If xenbus is enabled, may be overridden by {memory/selfballoon}==0
+# If false but xenballoond is able to communicate with domain0 via
+# xenbus, balloon targets will be set by domain0
+# 
+XENBALLOON_SELF=false
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If self-ballooning, number of seconds between checks/adjustments.
+# If xenbus is enabled, may be overridden by {memory/interval}
+XENBALLOON_SELF_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If NOT self-ballooning but xenbus is enabled, number of seconds between
+# checks/adjustments. May be overridden by {memory/interval}
+XENBALLOON_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 10
+#
+# When current > target, reduces rate at which target memory is ballooned
+# out.  For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/downhysteresis}
+XENBALLOON_AUTO_DOWNHYSTERESIS=10
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# When current < target, reduces rate at which target memory is reclaimed
+# (if available).  For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/uphysteresis}
+XENBALLOON_AUTO_UPHYSTERESIS=1
+
+## Type: integer (must be >= 0)
+## Default: 0
+#
+# In order to avoid ballooning so much memory that a guest experiences
+# out-of-memory errors (OOMs), memory will not be ballooned out below
+# a minimum target, in MB.  If this value is 0 (default), a heuristic
+# based on the maximum amount of memory will be used.  (The heuristic
+# provides the same minimum as recent versions of the balloon driver but
+# early versions of the balloon driver did not enforce a minimum.)
+XENBALLOON_MINMEM=0
+
+## Type: string
+## Default: "/var/run/xenballoon-maxmem"
+#
+# Location where memory high-water mark is stored; if a guest supports
+# hot-add memory, maxmem might increase across time and the minimum
+# target heuristic is based on max memory. NOTE: Reboot after changing
+# this variable, else overballooning may occur.
+XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/meminfo' on xenbus at memory/meminfo
+XENBALLOON_SEND_MEMINFO=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/vmstat' on xenbus at memory/vmstat
+XENBALLOON_SEND_VMSTAT=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/uptime' on xenbus at memory/uptime
+XENBALLOON_SEND_UPTIME=1
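
The two hysteresis settings above damp how fast the balloon moves: for a value of n, only 1/n of the distance between the current allocation and the target is applied per interval. The shell implementation is balloon_to_target() in the xenballoond script that follows; the C fragment below is only an illustration of the same stepping rule (the function name is made up for the example and is not part of this changeset):

    /* Illustration of the hysteresis rule described above: step 1/n of the
     * way from the current allocation toward the target each interval. */
    #include <stdint.h>

    static uint64_t next_balloon_bytes(uint64_t cur, uint64_t tgt,
                                       unsigned int downhys, unsigned int uphys)
    {
        if (cur > tgt && downhys != 0)
            return cur - (cur - tgt) / downhys;     /* shrink gradually */
        if (cur < tgt && uphys != 0)
            return cur + (tgt - cur) / uphys;       /* grow gradually */
        return tgt;
    }

With the defaults above (XENBALLOON_AUTO_DOWNHYSTERESIS=10, XENBALLOON_AUTO_UPHYSTERESIS=1), a guest 100MB over target gives back about 10MB per interval, while a guest under target reclaims the whole deficit in one step.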
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond      Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,205 @@
+#!/bin/bash
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# xenballoond - In-guest engine for Xen memory ballooning
+# Version: 080630
+#
+# Two "policies" are implemented:
+# - Selfballooning: Adjust memory periodically, with no (or little) input
+#     from domain0.  Target memory is determined solely by the
+#     Committed_AS line in /proc/meminfo, but parameters may adjust
+#     the rate at which the target is achieved.
+# - Directed ballooning: Adjust memory solely as directed by domain0
+#
+# Under some circumstances, "output" may also be generated; the contents
+# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus.
+#
+# If xenbus is running and the /usr/bin/xenstore-* tools are installed,
+# "xenbus is enabled".
+#
+# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although 
+# some are not used with directed ballooning, all must be set properly.
+# If xenbus is enabled, some of these parameters may be overridden by values
+# set by domain0 via xenbus.
+
+minmb() {
+       RETVAL=$XENBALLOON_MINMEM
+       if [ $RETVAL -ne 0 ]; then
+               return $RETVAL
+       fi
+       kb=`cat $XENBALLOON_MAXMEMFILE`
+       let "mb=$kb/1024"
+       let "pages=$kb/4"
+       # this algorithm from drivers/xen/balloon/balloon.c:minimum_target()
+       # which was added to balloon.c in 2008 to avoid ballooning too small
+       # it is unnecessary here except to accommodate pre-2008 balloon drivers
+       # note that ranges are adjusted because a VM with "memory=1024"
+       # gets somewhat less than 1024MB
+       if [ $mb -lt 125 ]; then
+               let RETVAL="$(( 8 + ($pages >> 9) ))"
+       elif [ $mb -lt 500 ]; then
+               let RETVAL="$(( 40 + ($pages >> 10) ))"
+       elif [ $mb -lt 2000 ]; then
+               let RETVAL="$(( 104 + ($pages >> 11) ))"
+       else
+               let RETVAL="$(( 296 + ($pages >> 13) ))"
+       fi
+       return  # value returned in RETVAL in MB
+}
+
+curkb() {
+       kb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | \
+               cut -f2 -d' '`
+       RETVAL=$kb
+       return  # value returned in RETVAL in kB
+}
+
+downhysteresis() {
+       RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/downhysteresis ; then
+                       RETVAL=`xenstore-read memory/downhysteresis`
+               fi
+       fi
+       return
+}
+
+uphysteresis() {
+       RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/uphysteresis ; then
+                       RETVAL=`xenstore-read memory/uphysteresis`
+               fi
+       fi
+       return
+}
+
+selfballoon_eval() {
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/selfballoon; then
+                       RETVAL=`xenstore-read memory/selfballoon`
+                       if [ $RETVAL -eq 1 ]; then
+                               selfballoon_enabled=true
+                               return
+                       fi
+               fi
+       fi
+       selfballoon_enabled=$XENBALLOON_SELF
+       return
+}
+
+selftarget() {
+       tgtkb=`grep Committed_AS /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+       minmb
+       let "minbytes=$RETVAL*1024*1024"
+       let "tgtbytes=$tgtkb*1024"
+       if [ $tgtbytes -lt $minbytes ]; then
+               let "tgtbytes=$minbytes"
+       fi
+       RETVAL=$tgtbytes  # value returned in RETVAL in bytes
+       return
+}
+
+# $1 == 1 means use selftarget, else target in kB
+balloon_to_target() {
+       if [ "$1" -eq 1 ]; then
+               selftarget
+               tgtbytes=$RETVAL
+       else
+               let "tgtbytes=$(( $1 * 1024 ))"
+       fi
+       curkb
+       let "curbytes=$RETVAL*1024"
+       if [ $curbytes -gt $tgtbytes ]; then
+               downhysteresis
+               downhys=$RETVAL
+               if [ $downhys -ne 0 ]; then
+                       let "tgtbytes=$(( $curbytes - \
+                               ( ( $curbytes - $tgtbytes ) / $downhys ) ))"
+               fi
+       else if [ $curbytes -lt $tgtbytes ]; then
+               uphysteresis
+               uphys=$RETVAL
+               let "tgtbytes=$(( $curbytes + \
+                               ( ( $tgtbytes - $curbytes ) / $uphys ) ))"
+               fi
+       fi
+       echo $tgtbytes > /proc/xen/balloon
+       if [ $xenstore_enabled = "true" ]; then
+               let "tgtkb=$(( $tgtbytes/1024 ))"
+               xenstore-write memory/selftarget $tgtkb
+       fi
+}
+
+send_memory_stats() {
+       if [ ! $xenstore_enabled = "true" ]; then
+               return
+       fi
+       if [ $XENBALLOON_SEND_MEMINFO ]; then
+               xenstore-write memory/meminfo "`cat /proc/meminfo`"
+       fi
+       if [ $XENBALLOON_SEND_VMSTAT ]; then
+               xenstore-write memory/vmstat "`cat /proc/vmstat`"
+       fi
+       if [ $XENBALLOON_SEND_UPTIME ]; then
+               xenstore-write memory/uptime "`cat /proc/uptime`"
+       fi
+}
+
+if [ ! -f /proc/xen/balloon ]; then
+       echo "$0: no balloon driver installed"
+       exit 0
+fi
+if [ ! -f /proc/meminfo ]; then
+       echo "$0: can't read /proc/meminfo"
+       exit 0
+fi
+xenstore_enabled=true
+if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \
+     -f /usr/bin/xenstore-write ]; then
+       xenstore_enabled=true
+else
+       echo "$0: missing /usr/bin/xenstore-* tools, disabling directed 
ballooning"
+       xenstore_enabled=false
+fi
+
+. /etc/sysconfig/xenballoon.conf
+
+while true;
+do
+       # handle special case for PV domains with hot-add memory
+       if [ ! -f $XENBALLOON_MAXMEMFILE ]; then
+               maxkb=0
+       else
+               maxkb=`cat $XENBALLOON_MAXMEMFILE`
+       fi
+       curkb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+       if [ $curkb -gt $maxkb ]; then
+               echo $curkb > $XENBALLOON_MAXMEMFILE
+       fi
+       interval=$XENBALLOON_INTERVAL
+       # do self-ballooning
+       selfballoon_eval
+       if [ $selfballoon_enabled = "true" ]; then
+               balloon_to_target 1
+               interval=$XENBALLOON_SELF_INTERVAL
+       # or do directed ballooning
+       elif [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/target ; then
+                       tgtkb=`xenstore-read memory/target`
+                       balloon_to_target $tgtkb
+               fi
+               interval=$XENBALLOON_INTERVAL
+       fi
+       send_memory_stats
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/interval ; then
+                       interval=`xenstore-read memory/interval`
+               fi
+       fi
+       sleep $interval
+done &
+
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.README
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.README       Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,82 @@
+Xenballoond.README
+Preliminary version 0.1, 2008/06/30
+
+Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+All rights reserved.
+Written by Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+
+INTRODUCTION
+
+Xenballoond runs in guest domains and both implements selfballooning and
+provides metrics to dom0 for (future) directed ballooning.  Both capabilities
+provide a foundation for basic "memory overcommit" functionality.
+
+With selfballooning enabled, xenballoond uses the Committed_AS value found
+in /proc/meminfo as a first approximation of how much memory is required
+by the guest and feeds this statistic back to the balloon driver to inflate
+or deflate the balloon as required to achieve the target guest memory size.
+Hysteresis parameters may be adjusted to rate-limit balloon inflation
+and deflation.
+
+If configured, certain selfballooning parameters -- including notably
+enabling/disabling of self-ballooning -- can be controlled from domain0.
+(These are fully documented in xenballoon.conf.)
+
+If configured, the following guest statistics are sent back to domain0:
+- /proc/meminfo
+- /proc/vmstat
+- /proc/uptime
+In a future release, some of these values will be used by a policy module
+in domain0 to control guest balloon size and provide memory balancing
+across all guests on a given system.
+
+Note that no page sharing (content-based or otherwise) is implemented
+and no VMM-based swapping is necessary.
+
+For more information, see:
+http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf
+http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf
+
+INSTALLATION AND DEPLOYMENT
+
+In this preliminary release:
+- directed ballooning is not implemented, though a monitor is provided
+- only Redhat-based guests are supported
+
+Guest prerequisites to use xenballoond:
+- each guest must be configured with adequate[1] swap space
+- each guest must have the balloon driver installed (/proc/xen/balloon exists) 
+- if directed ballooning (or monitoring) is desired, xenstore tools must be
+  installed in each guest in /usr/bin [2]
+
+[1] for best results, for a guest that is configured with maxmem=N and
+    requires Z MB of swap space without xenballoond, available swap should
+    be increased to N+Z MB when xenballoond is running
+[2] specifically xenstore-read, xenstore-exists, and xenstore-write must
+    be installed.  Binaries can be obtained, for example, by building
+    xen-vvv.gz/tools in a guest-binary-compatible development tree
+
+Instructions to install/deploy xenballoond (in Redhat-based system):
+- in each guest:
+  - ensure prerequisites are met (see above)
+  - place xenballoon.conf in /etc/sysconfig
+  - place xenballoond in /usr/sbin
+  - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename)
+  - edit /etc/sysconfig/xenballoon.conf as desired (especially note that
+    selfballooning defaults to off)
+  - start xenballoond with "service xenballoond start", and/or configure
+    xenballoond to start at init (e.g. "chkconfig xenballoond on")
+- in domain0:
+  - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin
+- note that certain xenballoon.conf variables may be overridden by domain0
+  if xenstore is running in the guest; these are fully documented in
+  xenballoon.conf
+
+TODO:
+080630 modifications to support SUSE-based and debian-based guests
+080630 domain0 ballooning policy module
+080630 experiment with more aggressive (optionally) memory minimum targets
+080630 BUG: xenballoond doesn't properly record the fact that it's running;
+       e.g. flipping between run levels 5 and 3 launches additional daemons
+080630 BUG: reports of possible incompatibilities between ballooning and
+       save/restore/migrate have not been duplicated
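
As the README above describes, the self-balloon target is simply Committed_AS (from /proc/meminfo) clamped to a minimum derived from the guest's maximum memory; the shell versions are minmb() and selftarget() in the xenballoond script earlier in this patch. The C rendering below is illustrative only (names are made up; a nonzero XENBALLOON_MINMEM replaces the heuristic entirely):

    /* Illustrative sketch of the self-balloon target: Committed_AS (kB)
     * clamped to a minimum computed from max memory, using the same
     * thresholds as minmb() in xenballoond. */
    #include <stdint.h>

    static uint64_t min_target_mb(uint64_t max_kb)
    {
        uint64_t mb = max_kb / 1024, pages = max_kb / 4;

        if (mb < 125)
            return 8 + (pages >> 9);
        if (mb < 500)
            return 40 + (pages >> 10);
        if (mb < 2000)
            return 104 + (pages >> 11);
        return 296 + (pages >> 13);
    }

    static uint64_t self_target_bytes(uint64_t committed_as_kb, uint64_t max_kb)
    {
        uint64_t tgtbytes = committed_as_kb * 1024;
        uint64_t minbytes = min_target_mb(max_kb) * 1024 * 1024;

        return tgtbytes < minbytes ? minbytes : tgtbytes;
    }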
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.init
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.init Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# xenballoond  Script to start and stop Xen ballooning daemon.
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# chkconfig: 2345 98 01
+# description: Starts and stops the Xen ballooning daemon.
+### BEGIN INIT INFO
+# Provides:          xenballoond
+# Required-Start:    $syslog $remote_fs
+# Should-Start:
+# Required-Stop:     $syslog $remote_fs
+# Should-Stop:
+# Default-Start:     3 4 5
+# Default-Stop:      0 1 2 6
+# Default-Enabled:   yes
+# Short-Description: Start/stop xenballoond
+# Description:       Starts and stops the Xen ballooning daemon.
+### END INIT INFO
+
+# Source function library
+. /etc/init.d/functions
+
+#don't use in domain0
+[ -f /proc/xen/capabilities ] && \
+       grep -q "control_d" /proc/xen/capabilities && exit 0
+
+if [ -f /etc/sysconfig/xenballoon.conf ]; then
+       . /etc/sysconfig/xenballoon.conf
+fi
+
+# Check that balloon driver is present
+[ ! -f /proc/xen/balloon ] && exit 0
+
+# Record original memory (in kB)
+[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0
+let maxmem=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+if [ -f "$XENBALLOON_MAXMEMFILE" ]; then
+       let oldmax=`cat $XENBALLOON_MAXMEMFILE`
+       if [ $oldmax -gt $maxmem ]; then
+               let maxmem=oldmax
+       fi
+fi
+echo $maxmem > $XENBALLOON_MAXMEMFILE
+
+RETVAL=0
+prog="xenballoond"
+
+start() {
+        # Start daemons.
+        echo -n $"Starting $prog: "
+        daemon xenballoond $OPTIONS
+       RETVAL=$?
+        echo
+       return $RETVAL
+}
+
+stop() {
+        echo -n $"Shutting down $prog: "
+       killproc xenballoond
+       RETVAL=$?
+        echo
+       return $RETVAL
+}
+
+# See how we were called.
+case "$1" in
+  start)
+       start
+        ;;
+  stop)
+       stop
+        ;;
+  status)
+       status xenballoond
+       RETVAL=$?
+       ;;
+  restart|reload)
+       stop
+       start
+       RETVAL=$?
+       ;;
+  *)
+        echo $"Usage: $0 {start|stop|restart|status}"
+        exit 1
+esac
+
+exit $RETVAL
diff -r 11318234588e -r 08f77df14cba tools/xentrace/xenctx.c
--- a/tools/xentrace/xenctx.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xentrace/xenctx.c   Wed Jul 02 11:30:37 2008 +0900
@@ -702,7 +702,7 @@ void dump_ctx(int vcpu)
 void dump_ctx(int vcpu)
 {
     int ret;
-    vcpu_guest_context_t ctx;
+    vcpu_guest_context_any_t ctx;
     xc_dominfo_t dominfo;
 
     xc_handle = xc_interface_open(); /* for accessing control interface */
@@ -727,10 +727,10 @@ void dump_ctx(int vcpu)
         exit(-1);
     }
 
-    print_ctx(&ctx);
+    print_ctx(&ctx.c);
 #ifndef NO_TRANSLATION
-    if (is_kernel_text(INSTR_POINTER((&ctx.user_regs))))
-        print_stack(&ctx, vcpu);
+    if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs))))
+        print_stack(&ctx.c, vcpu);
 #endif
 
     if (!dominfo.paused) {
diff -r 11318234588e -r 08f77df14cba tools/xm-test/lib/XmTestLib/block_utils.py
--- a/tools/xm-test/lib/XmTestLib/block_utils.py        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xm-test/lib/XmTestLib/block_utils.py        Wed Jul 02 11:30:37 2008 +0900
@@ -15,7 +15,7 @@ __all__ = [ "block_attach", "block_detac
 
 
 def get_state(domain, devname):
-    number = xen.util.blkif.blkdev_name_to_number(devname)
+    (path, number) = xen.util.blkif.blkdev_name_to_number(devname)
     s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" %
                         (domain.getName(), number))
     if s != 0:
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA
         rc = -ENOSYS;
         break;
 
+    case HVMOP_modified_memory:
+    {
+        struct xen_hvm_modified_memory a;
+        struct domain *d;
+        unsigned long pfn;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        if ( a.domid == DOMID_SELF )
+        {
+            d = rcu_lock_current_domain();
+        }
+        else
+        {
+            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+                return -ESRCH;
+            if ( !IS_PRIV_FOR(current->domain, d) )
+            {
+                rc = -EPERM;
+                goto param_fail3;
+            }
+        }
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail3;
+
+        rc = -EINVAL;
+        if ( a.first_pfn > domain_get_maximum_gpfn(d)
+                || a.first_pfn + a.nr - 1 < a.first_pfn
+                || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d))
+            goto param_fail3;
+
+        rc = 0;
+        if ( !d->arch.shadow_bitmap )
+            goto param_fail3;
+
+        for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++)
+            if (pfn < d->arch.shadow_bitmap_size)
+                set_bit(pfn, d->arch.shadow_bitmap);
+
+    param_fail3:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
         gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op);
         rc = -ENOSYS;
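
The ia64 handler above validates the pfn range and sets the corresponding bits in the domain's shadow bitmap so the pages are treated as dirty. Callers reach it through the new xc_hvm_modified_memory() declared in the xenctrl.h hunk earlier in this changeset; a minimal, illustrative use from a device model that has written guest frames directly might look like the sketch below (the wrapper name and values are placeholders, not part of the changeset):

    /* Sketch only: tell Xen that frames [first_pfn, first_pfn + nr) were
     * written by the device model so dirty tracking picks them up. */
    #include <stdio.h>
    #include <xenctrl.h>

    static void flag_dm_writes(int xc_handle, domid_t dom,
                               uint64_t first_pfn, uint64_t nr)
    {
        if (xc_hvm_modified_memory(xc_handle, dom, first_pfn, nr) != 0)
            fprintf(stderr, "xc_hvm_modified_memory failed\n");
    }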
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/xen/mm.c    Wed Jul 02 11:30:37 2008 +0900
@@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = alloc_domain(DOMID_XEN);
+    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = alloc_domain(DOMID_IO);
+    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 }
 
@@ -1553,7 +1553,7 @@ expose_p2m_init(void)
      * Initialise our DOMID_P2M domain.
      * This domain owns m2p table pages.
      */
-    dom_p2m = alloc_domain(DOMID_P2M);
+    dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
     BUG_ON(dom_p2m == NULL);
     dom_p2m->max_pages = ~0U;
 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/Makefile
--- a/xen/arch/x86/acpi/cpufreq/Makefile        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/Makefile        Wed Jul 02 11:30:37 2008 +0900
@@ -1,3 +1,4 @@ obj-y += cpufreq.o
 obj-y += cpufreq.o
 obj-y += utility.o
 obj-y += cpufreq_ondemand.o
+obj-y += powernow.o
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c       Wed Jul 02 11:30:37 2008 +0900
@@ -47,6 +47,10 @@ struct processor_pminfo processor_pminfo
 struct processor_pminfo processor_pminfo[NR_CPUS];
 struct cpufreq_policy xen_px_policy[NR_CPUS];
 
+static cpumask_t *cpufreq_dom_pt;
+static cpumask_t cpufreq_dom_mask;
+static unsigned int cpufreq_dom_max;
+
 enum {
     UNDEFINED_CAPABLE = 0,
     SYSTEM_INTEL_MSR_CAPABLE,
@@ -60,7 +64,6 @@ struct acpi_cpufreq_data {
     struct processor_performance *acpi_data;
     struct cpufreq_frequency_table *freq_table;
     unsigned int max_freq;
-    unsigned int resume;
     unsigned int cpu_feature;
 };
 
@@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp
 
     next_perf_state = data->freq_table[next_state].index;
     if (perf->state == next_perf_state) {
-        if (unlikely(data->resume)) {
-            printk("xen_pminfo: @acpi_cpufreq_target, "
-                "Called after resume, resetting to P%d\n", 
+        if (unlikely(policy->resume)) {
+            printk(KERN_INFO "Called after resume, resetting to P%d\n", 
                 next_perf_state);
-            data->resume = 0;
+            policy->resume = 0;
         }
-        else
+        else {
+            printk(KERN_INFO "Already at target state (P%d)\n", 
+                next_perf_state);
             return 0;
+        }
     }
 
     switch (data->cpu_feature) {
@@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
      * the first call to ->target() should result in us actually
      * writing something to the appropriate registers.
      */
-    data->resume = 1;
+    policy->resume = 1;
 
     return result;
 
@@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre
     .init   = acpi_cpufreq_cpu_init,
 };
 
-int acpi_cpufreq_init(void)
-{
-    unsigned int i, ret = 0;
-    unsigned int dom, max_dom = 0;
-    cpumask_t *pt, dom_mask;
-
-    cpus_clear(dom_mask);
+void cpufreq_dom_exit(void)
+{
+    cpufreq_dom_max = 0;
+    cpus_clear(cpufreq_dom_mask);
+    if (cpufreq_dom_pt)
+        xfree(cpufreq_dom_pt);
+}
+
+int cpufreq_dom_init(void)
+{
+    unsigned int i;
+
+    cpufreq_dom_max = 0;
+    cpus_clear(cpufreq_dom_mask);
 
     for_each_online_cpu(i) {
-        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
-        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
-            max_dom = processor_pminfo[i].perf.domain_info.domain;
-    }
-    max_dom++;
-
-    pt = xmalloc_array(cpumask_t, max_dom);
-    if (!pt)
+        cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
+        if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
+            cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
+    }
+    cpufreq_dom_max++;
+
+    cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
+    if (!cpufreq_dom_pt)
         return -ENOMEM;
-    memset(pt, 0, max_dom * sizeof(cpumask_t));
-
-    /* get cpumask of each psd domain */
+    memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
+
     for_each_online_cpu(i)
-        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+        cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
 
     for_each_online_cpu(i)
-        processor_pminfo[i].perf.shared_cpu_map = 
-            pt[processor_pminfo[i].perf.domain_info.domain];
-
-    cpufreq_driver = &acpi_cpufreq_driver;
-
-    /* setup cpufreq infrastructure */
+        processor_pminfo[i].perf.shared_cpu_map =
+            cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];
+
+    return 0;
+}
+
+static int cpufreq_cpu_init(void)
+{
+    int i, ret = 0;
+
     for_each_online_cpu(i) {
         xen_px_policy[i].cpu = i;
 
         ret = px_statistic_init(i);
         if (ret)
-            goto out;
+            return ret;
 
         ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
         if (ret)
-            goto out;
-    }
-
-    /* setup ondemand cpufreq */
-    for (dom=0; dom<max_dom; dom++) {
-        if (!cpu_isset(dom, dom_mask))
+            return ret;
+    }
+    return ret;
+}
+
+int cpufreq_dom_dbs(unsigned int event)
+{
+    int cpu, dom, ret = 0;
+
+    for (dom=0; dom<cpufreq_dom_max; dom++) {
+        if (!cpu_isset(dom, cpufreq_dom_mask))
             continue;
-        i = first_cpu(pt[dom]);
-        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+        cpu = first_cpu(cpufreq_dom_pt[dom]);
+        ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
         if (ret)
-            goto out;
-    }
-
-out:
-    xfree(pt);
-   
+            return ret;
+    }
     return ret;
 }
+
+int acpi_cpufreq_init(void)
+{
+    int ret = 0;
+    
+    /* setup cpumask of psd dom and shared cpu map of cpu */
+    ret = cpufreq_dom_init();
+    if (ret)
+        goto err;
+
+    /* setup cpufreq driver */
+    cpufreq_driver = &acpi_cpufreq_driver;
+
+    /* setup cpufreq infrastructure */
+    ret = cpufreq_cpu_init();
+    if (ret)
+        goto err;
+
+    /* setup cpufreq dbs according to dom coordination */
+    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+    if (ret)
+        goto err;
+
+    return ret;
+
+err:
+    cpufreq_dom_exit();
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c      Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static struct dbs_tuners {
 
 static struct timer dbs_timer[NR_CPUS];
 
-static inline uint64_t get_cpu_idle_time(unsigned int cpu)
+inline uint64_t get_cpu_idle_time(unsigned int cpu)
 {
     uint64_t idle_ns;
     struct vcpu *v;
@@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs
         return;
 
     policy = this_dbs_info->cur_policy;
+
+    if (unlikely(policy->resume)) {
+        __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H);
+        return;
+    }
+
     cur_ns = NOW();
     total_ns = cur_ns - this_dbs_info->prev_cpu_wall;
     this_dbs_info->prev_cpu_wall = NOW();
@@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_
         break;
 
     case CPUFREQ_GOV_STOP:
-        if (this_dbs_info->enable)
-            dbs_timer_exit(this_dbs_info);
+        dbs_timer_exit(this_dbs_info);
         dbs_enable--;
 
         break;
@@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_
         break;
     }
     return 0;
-}
-             
+} 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/powernow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c      Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,305 @@
+/*
+ *  powernow - AMD Architectural P-state Driver ($Revision: 1.4 $)
+ *
+ *  Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@xxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/delay.h>
+#include <xen/cpumask.h>
+#include <xen/timer.h>
+#include <xen/xmalloc.h>
+#include <asm/bug.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/config.h>
+#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/cpufeature.h>
+#include <acpi/acpi.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define CPUID_FREQ_VOLT_CAPABILITIES    0x80000007
+#define USE_HW_PSTATE           0x00000080
+#define HW_PSTATE_MASK          0x00000007
+#define HW_PSTATE_VALID_MASK    0x80000000
+#define HW_PSTATE_MAX_MASK      0x000000f0
+#define HW_PSTATE_MAX_SHIFT     4
+#define MSR_PSTATE_DEF_BASE     0xc0010064 /* base of Pstate MSRs */
+#define MSR_PSTATE_STATUS       0xc0010063 /* Pstate Status MSR */
+#define MSR_PSTATE_CTRL         0xc0010062 /* Pstate control MSR */
+#define MSR_PSTATE_CUR_LIMIT    0xc0010061 /* pstate current limit MSR */
+
+extern struct processor_pminfo processor_pminfo[NR_CPUS];
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
+
+struct powernow_cpufreq_data {
+    struct processor_performance *acpi_data;
+    struct cpufreq_frequency_table *freq_table;
+    unsigned int max_freq;
+    unsigned int resume;
+    unsigned int cpu_feature;
+};
+
+static struct powernow_cpufreq_data *drv_data[NR_CPUS];
+
+struct drv_cmd {
+    unsigned int type;
+    cpumask_t mask;
+    u64 addr;
+    u32 val;
+};
+
+static void transition_pstate(void *drvcmd)
+{
+    struct drv_cmd *cmd;
+    cmd = (struct drv_cmd *) drvcmd;
+
+    wrmsr(MSR_PSTATE_CTRL, cmd->val, 0);
+}
+
+static int powernow_cpufreq_target(struct cpufreq_policy *policy,
+                               unsigned int target_freq, unsigned int relation)
+{
+    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+    struct processor_performance *perf;
+    struct cpufreq_freqs freqs;
+    cpumask_t online_policy_cpus;
+    struct drv_cmd cmd;
+    unsigned int next_state = 0; /* Index into freq_table */
+    unsigned int next_perf_state = 0; /* Index into perf table */
+    int result = 0;
+
+    if (unlikely(data == NULL ||
+        data->acpi_data == NULL || data->freq_table == NULL)) {
+        return -ENODEV;
+    }
+
+    perf = data->acpi_data;
+    result = cpufreq_frequency_table_target(policy,
+                                            data->freq_table,
+                                            target_freq,
+                                            relation, &next_state);
+    if (unlikely(result))
+        return -ENODEV;
+
+    online_policy_cpus = policy->cpus;
+
+    next_perf_state = data->freq_table[next_state].index;
+    if (perf->state == next_perf_state) {
+        if (unlikely(data->resume)) 
+            data->resume = 0;
+        else
+            return 0;
+    }
+
+    cpus_clear(cmd.mask);
+
+    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
+        cmd.mask = online_policy_cpus;
+    else
+        cpu_set(policy->cpu, cmd.mask);
+
+    freqs.old = perf->states[perf->state].core_frequency * 1000;
+    freqs.new = data->freq_table[next_state].frequency;
+
+    cmd.val = next_perf_state;
+
+    on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0);
+
+    perf->state = next_perf_state;
+    policy->cur = freqs.new;
+
+    return result;
+}
+
+static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int i;
+    unsigned int valid_states = 0;
+    unsigned int cpu = policy->cpu;
+    struct powernow_cpufreq_data *data;
+    unsigned int result = 0;
+    struct processor_performance *perf;
+    u32 max_hw_pstate, hi = 0, lo = 0;
+
+    data = xmalloc(struct powernow_cpufreq_data);
+    if (!data)
+        return -ENOMEM;
+    memset(data, 0, sizeof(struct powernow_cpufreq_data));
+
+    drv_data[cpu] = data;
+
+    data->acpi_data = &processor_pminfo[cpu].perf;
+
+    perf = data->acpi_data;
+    policy->shared_type = perf->shared_type;
+
+    /*
+     * Will let policy->cpus know about dependency only when software
+     * coordination is required.
+     */
+    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
+        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+        policy->cpus = perf->shared_cpu_map;
+    } else {
+        policy->cpus = cpumask_of_cpu(cpu);    
+    }
+
+    /* capability check */
+    if (perf->state_count <= 1) {
+        printk("No P-States\n");
+        result = -ENODEV;
+        goto err_unreg;
+    }
+    rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
+    max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+
+    if (perf->control_register.space_id != perf->status_register.space_id) {
+        result = -ENODEV;
+        goto err_unreg;
+    }
+
+    data->freq_table = xmalloc_array(struct cpufreq_frequency_table, 
+                                    (perf->state_count+1));
+    if (!data->freq_table) {
+        result = -ENOMEM;
+        goto err_unreg;
+    }
+
+    /* detect transition latency */
+    policy->cpuinfo.transition_latency = 0;
+    for (i=0; i<perf->state_count; i++) {
+        if ((perf->states[i].transition_latency * 1000) >
+            policy->cpuinfo.transition_latency)
+            policy->cpuinfo.transition_latency =
+                perf->states[i].transition_latency * 1000;
+    }
+
+    data->max_freq = perf->states[0].core_frequency * 1000;
+    /* table init */
+    for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
+        if (i>0 && perf->states[i].core_frequency >=
+            data->freq_table[valid_states-1].frequency / 1000)
+            continue;
+
+        data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK;
+        data->freq_table[valid_states].frequency =
+            perf->states[i].core_frequency * 1000;
+        valid_states++;
+    }
+    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+    perf->state = 0;
+
+    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
+    if (result)
+        goto err_freqfree;
+
+    /*
+     * the first call to ->target() should result in us actually
+     * writing something to the appropriate registers.
+     */
+    data->resume = 1;
+
+    policy->cur = data->freq_table[i].frequency;
+    return result;
+
+err_freqfree:
+    xfree(data->freq_table);
+err_unreg:
+    xfree(data);
+    drv_data[cpu] = NULL;
+
+    return result;
+}
+
+static struct cpufreq_driver powernow_cpufreq_driver = {
+    .target = powernow_cpufreq_target,
+    .init   = powernow_cpufreq_cpu_init,
+};
+
+int powernow_cpufreq_init(void)
+{
+    unsigned int i, ret = 0;
+    unsigned int dom, max_dom = 0;
+    cpumask_t *pt, dom_mask;
+
+    cpus_clear(dom_mask);
+
+    for_each_online_cpu(i) {
+        struct cpuinfo_x86 *c = &cpu_data[i];
+       if (c->x86_vendor != X86_VENDOR_AMD)
+            ret = -ENODEV;
+        else 
+        {
+            u32 eax, ebx, ecx, edx;
+            cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+            if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
+                ret = -ENODEV;
+       }
+        if (ret)
+            return ret;
+        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
+        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
+            max_dom = processor_pminfo[i].perf.domain_info.domain;
+    }
+    max_dom++;
+
+    pt = xmalloc_array(cpumask_t, max_dom);
+    if (!pt)
+        return -ENOMEM;
+    memset(pt, 0, max_dom * sizeof(cpumask_t));
+
+    /* get cpumask of each psd domain */
+    for_each_online_cpu(i)
+        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+
+    for_each_online_cpu(i)
+        processor_pminfo[i].perf.shared_cpu_map = 
+            pt[processor_pminfo[i].perf.domain_info.domain];
+
+    cpufreq_driver = &powernow_cpufreq_driver;
+
+    /* setup cpufreq infrastructure */
+    for_each_online_cpu(i) {
+        xen_px_policy[i].cpu = i;
+
+        ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
+        if (ret)
+            goto cpufreq_init_out;
+    }
+
+    /* setup ondemand cpufreq */
+    for (dom=0; dom<max_dom; dom++) {
+        if (!cpu_isset(dom, dom_mask))
+            continue;
+        i = first_cpu(pt[dom]);
+        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+        if (ret)
+            goto cpufreq_init_out;
+    }
+
+cpufreq_init_out:
+    xfree(pt);
+   
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/utility.c
--- a/xen/arch/x86/acpi/cpufreq/utility.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/utility.c       Wed Jul 02 11:30:37 2008 +0900
@@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver;
  *                    Px STATISTIC INFO                              *
  *********************************************************************/
 
+void px_statistic_suspend(void)
+{
+    int cpu;
+    uint64_t now;
+
+    now = NOW();
+
+    for_each_online_cpu(cpu) {
+        struct pm_px *pxpt = &px_statistic_data[cpu];
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(cpu);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
+
+        pxpt->u.pt[pxpt->u.cur].residency +=
+                    now - pxpt->prev_state_wall;
+        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
+    }
+}
+
+void px_statistic_resume(void)
+{
+    int cpu;
+    uint64_t now;
+
+    now = NOW();
+
+    for_each_online_cpu(cpu) {
+        struct pm_px *pxpt = &px_statistic_data[cpu];
+        pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = get_cpu_idle_time(cpu);
+    }
+}
+
 void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to)
 {
     uint32_t i;
@@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma
     for_each_cpu_mask(i, cpumask) {
         struct pm_px *pxpt = &px_statistic_data[i];
         uint32_t statnum = processor_pminfo[i].perf.state_count;
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(i);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
 
         pxpt->u.last = from;
         pxpt->u.cur = to;
         pxpt->u.pt[to].count++;
         pxpt->u.pt[from].residency += now - pxpt->prev_state_wall;
+        pxpt->u.pt[from].residency -= tmp_idle_ns;
 
         (*(pxpt->u.trans_pt + from*statnum + to))++;
 
         pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = total_idle_ns;
     }
 }
 
@@ -87,6 +129,7 @@ int px_statistic_init(int cpuid)
         pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency;
 
     pxpt->prev_state_wall = NOW();
+    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
 
     return 0;
 }
@@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid)
     }
 
     pxpt->prev_state_wall = NOW();
+    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
 }
 
 
@@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr
 
     return ret;
 }
+
+
+/*********************************************************************
+ *               CPUFREQ SUSPEND/RESUME                              *
+ *********************************************************************/
+
+void cpufreq_suspend(void)
+{
+    int cpu;
+
+    /* to protect the case when Px was controlled by dom0-kernel */
+    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+    for_each_online_cpu(cpu) {
+        struct processor_performance *perf = &processor_pminfo[cpu].perf;
+
+        if (!perf->init)
+            return;
+    }
+
+    cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
+
+    cpufreq_dom_exit();
+
+    px_statistic_suspend();
+}
+
+int cpufreq_resume(void)
+{
+    int cpu, ret = 0;
+
+    /* 1. to protect the case when Px was controlled by dom0-kernel */
+    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+    /* 2. set state and resume flag to sync cpu to right state and freq */
+    for_each_online_cpu(cpu) {
+        struct processor_performance *perf = &processor_pminfo[cpu].perf;
+        struct cpufreq_policy *policy = &xen_px_policy[cpu];
+
+        if (!perf->init)
+            goto err;
+        perf->state = 0;
+        policy->resume = 1;
+    }
+
+    px_statistic_resume();
+
+    ret = cpufreq_dom_init();
+    if (ret)
+        goto err;
+
+    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+    if (ret)
+        goto err;
+
+    return ret;
+
+err:
+    cpufreq_dom_exit();
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/pmstat.c
--- a/xen/arch/x86/acpi/pmstat.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/pmstat.c        Wed Jul 02 11:30:37 2008 +0900
@@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get
     case PMSTAT_get_pxstat:
     {
         uint64_t now, ct;
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(op->cpuid);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
 
         now = NOW();
         pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
         pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
+        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
         pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = total_idle_ns;
 
         ct = pmpt->perf.state_count;
         if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/power.c Wed Jul 02 11:30:37 2008 +0900
@@ -27,7 +27,7 @@
 #include <public/platform.h>
 #include <asm/tboot.h>
 
-#define pmprintk(_l, _f, _a...) printk(_l "<PM> " _f "\n", ## _a )
+#include <acpi/cpufreq/cpufreq.h>
 
 static char opt_acpi_sleep[20];
 string_param("acpi_sleep", opt_acpi_sleep);
@@ -124,9 +124,11 @@ static int enter_state(u32 state)
     if ( !spin_trylock(&pm_lock) )
         return -EBUSY;
 
-    pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state);
+    printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state);
 
     freeze_domains();
+
+    cpufreq_suspend();
 
     disable_nonboot_cpus();
     if ( num_online_cpus() != 1 )
@@ -139,11 +141,14 @@ static int enter_state(u32 state)
 
     acpi_sleep_prepare(state);
 
+    console_start_sync();
+    printk("Entering ACPI S%d state.\n", state);
+
     local_irq_save(flags);
 
     if ( (error = device_power_down()) )
     {
-        pmprintk(XENLOG_ERR, "Some devices failed to power down.");
+        printk(XENLOG_ERR "Some devices failed to power down.");
         goto done;
     }
 
@@ -162,8 +167,6 @@ static int enter_state(u32 state)
         break;
     }
 
-    pmprintk(XENLOG_DEBUG, "Back to C.");
-
     /* Restore CR4 and EFER from cached values. */
     write_cr4(read_cr4());
     if ( cpu_has_efer )
@@ -171,16 +174,18 @@ static int enter_state(u32 state)
 
     device_power_up();
 
-    pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state);
+    printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state);
 
  done:
     local_irq_restore(flags);
+    console_end_sync();
     acpi_sleep_post(state);
     if ( !hvm_cpu_up() )
         BUG();
 
  enable_cpu:
     enable_nonboot_cpus();
+    cpufreq_resume();
     thaw_domains();
     spin_unlock(&pm_lock);
     return error;
@@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_
          ((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) &
           ACPI_BITMASK_SLEEP_ENABLE) )
     {
-        pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
+        gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
         return -EINVAL;
     }
 
@@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_
     if ( tboot_in_measured_env() )
     {
         tboot_sleep(sleep_state);
-        pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n");
+        printk(XENLOG_ERR "TBOOT failed entering s3 state\n");
         return_ACPI_STATUS(AE_ERROR);
     }
 
@@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void)
             p += strspn(p, ", \t");
     }
 
-    printk(XENLOG_INFO "<PM> ACPI (supports");
+    printk(XENLOG_INFO "ACPI sleep modes:");
     for ( i = 0; i < ACPI_S_STATE_COUNT; i++ )
     {
         if ( i == ACPI_STATE_S3 )
@@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void)
         else
             sleep_states[i] = 0;
     }
-    printk(")\n");
+    printk("\n");
 
     return 0;
 }
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c        Wed Jul 02 11:30:37 2008 +0900
@@ -21,15 +21,33 @@
 
 static int hvmemul_do_io(
     int is_mmio, paddr_t addr, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    paddr_t value = ram_gpa;
+    int value_is_ptr = (p_data == NULL);
     struct vcpu *curr = current;
     vcpu_iodata_t *vio = get_ioreq(curr);
     ioreq_t *p = &vio->vp_ioreq;
     int rc;
 
-    /* Only retrieve the value from singleton (non-REP) reads. */
-    ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr));
+    /*
+     * Weird-sized accesses have undefined behaviour: we discard writes
+     * and read all-ones.
+     */
+    if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
+    {
+        gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
+        ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
+        if ( dir == IOREQ_READ )
+            memset(p_data, ~0, size);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( (p_data != NULL) && (dir == IOREQ_WRITE) )
+    {
+        memcpy(&value, p_data, size);
+        p_data = NULL;
+    }
 
     if ( is_mmio && !value_is_ptr )
     {
@@ -47,8 +65,7 @@ static int hvmemul_do_io(
             unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes;
             if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
             {
-                *val = 0;
-                memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
+                memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
                        size);
                 return X86EMUL_OKAY;
             }
@@ -61,7 +78,7 @@ static int hvmemul_do_io(
         break;
     case HVMIO_completed:
         curr->arch.hvm_vcpu.io_state = HVMIO_none;
-        if ( val == NULL )
+        if ( p_data == NULL )
             return X86EMUL_UNHANDLEABLE;
         goto finish_access;
     case HVMIO_dispatched:
@@ -82,7 +99,7 @@ static int hvmemul_do_io(
     }
 
     curr->arch.hvm_vcpu.io_state =
-        (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
+        (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
 
     p->dir = dir;
     p->data_is_ptr = value_is_ptr;
@@ -116,7 +133,7 @@ static int hvmemul_do_io(
         break;
     case X86EMUL_UNHANDLEABLE:
         hvm_send_assist_req(curr);
-        rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
+        rc = (p_data != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
         break;
     default:
         BUG();
@@ -126,8 +143,8 @@ static int hvmemul_do_io(
         return rc;
 
  finish_access:
-    if ( val != NULL )
-        *val = curr->arch.hvm_vcpu.io_data;
+    if ( p_data != NULL )
+        memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size);
 
     if ( is_mmio && !value_is_ptr )
     {
@@ -152,7 +169,7 @@ static int hvmemul_do_io(
                   sizeof(curr->arch.hvm_vcpu.mmio_large_read)) )
             {
                 memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
-                       val, size);
+                       p_data, size);
                 curr->arch.hvm_vcpu.mmio_large_read_bytes += size;
             }
         }
@@ -163,18 +180,16 @@ static int hvmemul_do_io(
 
 static int hvmemul_do_pio(
     unsigned long port, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
-    return hvmemul_do_io(0, port, reps, size, value,
-                         dir, df, value_is_ptr, val);
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
 }
 
 static int hvmemul_do_mmio(
     paddr_t gpa, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
-    return hvmemul_do_io(1, gpa, reps, size, value,
-                         dir, df, value_is_ptr, val);
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
 }
 
 /*
@@ -287,7 +302,7 @@ static int __hvmemul_read(
 static int __hvmemul_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     enum hvm_access_type access_type,
     struct hvm_emulate_ctxt *hvmemul_ctxt)
@@ -302,8 +317,6 @@ static int __hvmemul_read(
         seg, offset, bytes, access_type, hvmemul_ctxt, &addr);
     if ( rc != X86EMUL_OKAY )
         return rc;
-
-    *val = 0;
 
     if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) &&
          curr->arch.hvm_vcpu.mmio_gva )
@@ -314,7 +327,7 @@ static int __hvmemul_read(
         gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
         if ( (off + bytes) <= PAGE_SIZE )
             return hvmemul_do_mmio(gpa, &reps, bytes, 0,
-                                   IOREQ_READ, 0, 0, val);
+                                   IOREQ_READ, 0, p_data);
     }
 
     if ( (seg != x86_seg_none) &&
@@ -322,15 +335,13 @@ static int __hvmemul_read(
         pfec |= PFEC_user_mode;
 
     rc = ((access_type == hvm_access_insn_fetch) ?
-          hvm_fetch_from_guest_virt(val, addr, bytes, pfec) :
-          hvm_copy_from_guest_virt(val, addr, bytes, pfec));
+          hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) :
+          hvm_copy_from_guest_virt(p_data, addr, bytes, pfec));
     if ( rc == HVMCOPY_bad_gva_to_gfn )
         return X86EMUL_EXCEPTION;
 
     if ( rc == HVMCOPY_bad_gfn_to_mfn )
     {
-        unsigned long reps = 1;
-
         if ( access_type == hvm_access_insn_fetch )
             return X86EMUL_UNHANDLEABLE;
 
@@ -339,7 +350,7 @@ static int __hvmemul_read(
         if ( rc != X86EMUL_OKAY )
             return rc;
 
-        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data);
     }
 
     return X86EMUL_OKAY;
@@ -348,19 +359,19 @@ static int hvmemul_read(
 static int hvmemul_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
     return __hvmemul_read(
-        seg, offset, val, bytes, hvm_access_read,
+        seg, offset, p_data, bytes, hvm_access_read,
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
 static int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -371,19 +382,18 @@ static int hvmemul_insn_fetch(
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
         return __hvmemul_read(
-            seg, offset, val, bytes,
+            seg, offset, p_data, bytes,
             hvm_access_insn_fetch, hvmemul_ctxt);
 
     /* Hit the cache. Simple memcpy. */
-    *val = 0;
-    memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes);
+    memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
     return X86EMUL_OKAY;
 }
 
 static int hvmemul_write(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -406,29 +416,27 @@ static int hvmemul_write(
         unsigned int off = addr & (PAGE_SIZE - 1);
         gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
         if ( (off + bytes) <= PAGE_SIZE )
-            return hvmemul_do_mmio(gpa, &reps, bytes, val,
-                                   IOREQ_WRITE, 0, 0, NULL);
+            return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+                                   IOREQ_WRITE, 0, p_data);
     }
 
     if ( (seg != x86_seg_none) &&
          (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
         pfec |= PFEC_user_mode;
 
-    rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec);
+    rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec);
     if ( rc == HVMCOPY_bad_gva_to_gfn )
         return X86EMUL_EXCEPTION;
 
     if ( rc == HVMCOPY_bad_gfn_to_mfn )
     {
-        unsigned long reps = 1;
-
         rc = hvmemul_linear_to_phys(
             addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
         if ( rc != X86EMUL_OKAY )
             return rc;
 
-        return hvmemul_do_mmio(gpa, &reps, bytes, val,
-                               IOREQ_WRITE, 0, 0, NULL);
+        return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+                               IOREQ_WRITE, 0, p_data);
     }
 
     return X86EMUL_OKAY;
@@ -442,12 +450,8 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    unsigned long new = 0;
-    if ( bytes > sizeof(new) )
-        return X86EMUL_UNHANDLEABLE;
-    memcpy(&new, p_new, bytes);
     /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, new, bytes, ctxt);
+    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
 }
 
 static int hvmemul_rep_ins(
@@ -480,7 +484,7 @@ static int hvmemul_rep_ins(
         return rc;
 
     return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
-                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_rep_outs(
@@ -513,7 +517,7 @@ static int hvmemul_rep_outs(
         return rc;
 
     return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
-                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_rep_movs(
@@ -563,14 +567,14 @@ static int hvmemul_rep_movs(
     if ( !p2m_is_ram(p2mt) )
         return hvmemul_do_mmio(
             sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
-            !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+            !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 
     (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
     if ( p2m_is_ram(p2mt) )
         return X86EMUL_UNHANDLEABLE;
     return hvmemul_do_mmio(
         dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
-        !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+        !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_read_segment(
@@ -607,7 +611,8 @@ static int hvmemul_read_io(
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned long reps = 1;
-    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+    *val = 0;
+    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val);
 }
 
 static int hvmemul_write_io(
@@ -617,7 +622,7 @@ static int hvmemul_write_io(
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned long reps = 1;
-    return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL);
+    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val);
 }
 
 static int hvmemul_read_cr(
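
With this rework the emulation callbacks pass data through a void pointer instead of an unsigned long, and hvmemul_do_io() derives the old value/value_is_ptr pair itself: a non-NULL p_data carries the datum (copied into the ioreq on writes, filled in when a read completes), while p_data == NULL selects the pointer/REP form, with ram_gpa naming the guest buffer. Accesses that are not a power-of-two size, or larger than sizeof(long), are rejected up front (reads return all-ones). A hedged sketch of the two calling forms, using only functions defined in the hunk above; the wrapper names here are invented:

    /* Illustration only -- not part of the patch. */
    static int single_mmio_read(paddr_t gpa, void *buf, int size)
    {
        unsigned long reps = 1;
        /* Non-NULL p_data: the result is copied into *buf on completion. */
        return hvmemul_do_mmio(gpa, &reps, size, 0, IOREQ_READ, 0, buf);
    }

    static int rep_mmio_write(paddr_t gpa, paddr_t ram_gpa,
                              unsigned long *reps, int size, int df)
    {
        /* NULL p_data: pointer form, ram_gpa names the guest data buffer. */
        return hvmemul_do_mmio(gpa, reps, size, ram_gpa, IOREQ_WRITE, df, NULL);
    }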
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Wed Jul 02 11:30:37 2008 +0900
@@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE
         break;
     }
 
+    case HVMOP_modified_memory:
+    {
+        struct xen_hvm_modified_memory a;
+        struct domain *d;
+        unsigned long pfn;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        if ( a.domid == DOMID_SELF )
+        {
+            d = rcu_lock_current_domain();
+        }
+        else
+        {
+            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+                return -ESRCH;
+            if ( !IS_PRIV_FOR(current->domain, d) )
+            {
+                rc = -EPERM;
+                goto param_fail3;
+            }
+        }
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail3;
+
+        rc = xsm_hvm_param(d, op);
+        if ( rc )
+            goto param_fail3;
+
+        rc = -EINVAL;
+        if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
+             ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
+             ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
+            goto param_fail3;
+
+        rc = 0;
+        if ( !paging_mode_log_dirty(d) )
+            goto param_fail3;
+
+        for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
+        {
+            p2m_type_t t;
+            mfn_t mfn = gfn_to_mfn(d, pfn, &t);
+            if ( mfn_x(mfn) != INVALID_MFN )
+            {
+                paging_mark_dirty(d, mfn_x(mfn));
+                /* These are most probably not page tables any more, so */
+                /* don't take a long time over it and don't die either. */
+                sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
+            }
+        }
+
+    param_fail3:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 02 11:30:37 2008 +0900
@@ -677,10 +677,11 @@ static int construct_vmcs(struct vcpu *v
     return 0;
 }
 
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_read_guest_msr(u32 msr, u64 *val)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
 
     for ( i = 0; i < msr_count; i++ )
     {
@@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u
     return -ESRCH;
 }
 
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_write_guest_msr(u32 msr, u64 val)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
 
     for ( i = 0; i < msr_count; i++ )
     {
@@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v, 
     return -ESRCH;
 }
 
-int vmx_add_guest_msr(struct vcpu *v, u32 msr)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_add_guest_msr(u32 msr)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+
+    if ( msr_area == NULL )
+    {
+        if ( (msr_area = alloc_xenheap_page()) == NULL )
+            return -ENOMEM;
+        curr->arch.hvm_vmx.msr_area = msr_area;
+        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
+        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+    }
 
     for ( i = 0; i < msr_count; i++ )
         if ( msr_area[i].index == msr )
@@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3
     if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
         return -ENOSPC;
 
-    if ( msr_area == NULL )
-    {
-        if ( (msr_area = alloc_xenheap_page()) == NULL )
-            return -ENOMEM;
-        v->arch.hvm_vmx.msr_area = msr_area;
-        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
-        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
-    }
-
     msr_area[msr_count].index = msr;
     msr_area[msr_count].mbz   = 0;
     msr_area[msr_count].data  = 0;
-    v->arch.hvm_vmx.msr_count = ++msr_count;
+    curr->arch.hvm_vmx.msr_count = ++msr_count;
     __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
     __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);
 
     return 0;
 }
 
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;
+int vmx_add_host_load_msr(u32 msr)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area;
+
+    if ( msr_area == NULL )
+    {
+        if ( (msr_area = alloc_xenheap_page()) == NULL )
+            return -ENOMEM;
+        curr->arch.hvm_vmx.host_msr_area = msr_area;
+        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+    }
 
     for ( i = 0; i < msr_count; i++ )
         if ( msr_area[i].index == msr )
@@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v
     if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
         return -ENOSPC;
 
-    if ( msr_area == NULL )
-    {
-        if ( (msr_area = alloc_xenheap_page()) == NULL )
-            return -ENOMEM;
-        v->arch.hvm_vmx.host_msr_area = msr_area;
-        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
-    }
-
     msr_area[msr_count].index = msr;
     msr_area[msr_count].mbz   = 0;
     rdmsrl(msr, msr_area[msr_count].data);
-    v->arch.hvm_vmx.host_msr_count = ++msr_count;
+    curr->arch.hvm_vmx.host_msr_count = ++msr_count;
     __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);
 
     return 0;
@@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v)
     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
     int rc;
 
-    if ( arch_vmx->vmcs == NULL )
-    {
-        if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
-            return -ENOMEM;
-
-        INIT_LIST_HEAD(&arch_vmx->active_list);
-        __vmpclear(virt_to_maddr(arch_vmx->vmcs));
-        arch_vmx->active_cpu = -1;
-        arch_vmx->launched   = 0;
-    }
+    if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
+        return -ENOMEM;
+
+    INIT_LIST_HEAD(&arch_vmx->active_list);
+    __vmpclear(virt_to_maddr(arch_vmx->vmcs));
+    arch_vmx->active_cpu = -1;
+    arch_vmx->launched   = 0;
 
     if ( (rc = construct_vmcs(v)) != 0 )
     {
         vmx_free_vmcs(arch_vmx->vmcs);
-        arch_vmx->vmcs = NULL;
         return rc;
     }
 
@@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v)
 {
     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
 
-    if ( arch_vmx->vmcs == NULL )
-        return;
-
     vmx_clear_vmcs(v);
 
     vmx_free_vmcs(arch_vmx->vmcs);
-    arch_vmx->vmcs = NULL;
+
+    free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
+    free_xenheap_page(v->arch.hvm_vmx.msr_area);
+    free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
 }
 
 void vm_launch_fail(void)
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 02 11:30:37 2008 +0900
@@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e
         break;
     case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
         value = v->arch.hvm_vcpu.guest_cr[0];
-        value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF);
+        /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+        value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
         HVMTRACE_LONG_1D(LMSW, current, value);
         return !hvm_set_cr0(value);
     default:
@@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct
                 goto done;
         }
 
-        if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
+        if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
             break;
 
         if ( is_last_branch_msr(ecx) )
@@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc
 
             for ( ; (rc == 0) && lbr->count; lbr++ )
                 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
-                    if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
+                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
                         vmx_disable_intercept_for_msr(v, lbr->base + i);
         }
 
         if ( (rc < 0) ||
-             (vmx_add_host_load_msr(v, ecx) < 0) )
+             (vmx_add_host_load_msr(ecx) < 0) )
             vmx_inject_hw_exception(v, TRAP_machine_check, 0);
         else
         {
@@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc
         switch ( long_mode_do_msr_write(regs) )
         {
             case HNDL_unhandled:
-                if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
+                if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
                      !is_last_branch_msr(ecx) )
                     wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
                 break;
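
One of the hunks above also tightens the LMSW emulation: the old code spliced the full 16-bit LMSW operand into CR0, while architecturally LMSW may set bits 0-3 but may only clear bits 1-3 (it can never clear PE). A worked example with invented values:

    /* Illustration only. */
    static unsigned long lmsw_example(void)
    {
        unsigned long cr0  = 0x8005003bUL;  /* PE, MP and TS set, among others */
        unsigned long lmsw = 0x0001UL;      /* operand asks for PE only        */

        /* Same masking as the vmx_cr_access() hunk: bits 1-3 follow the
         * operand, bit 0 (PE) can be set but never cleared. */
        return (cr0 & ~0xeUL) | (lmsw & 0xfUL);  /* == 0x80050031: MP/TS cleared */
    }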
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vpmu_core2.c
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str
         return 0;
 
     wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-    if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
-        return 0;
-
-    if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
-        return 0;
-    vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
+    if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        return 0;
+
+    if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        return 0;
+    vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
 
     pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) +
                  (core2_get_pmc_count()-1)*sizeof(char));
@@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp
         break;
     case MSR_CORE_PERF_FIXED_CTR_CTRL:
         non_global_ctrl = msr_content;
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
         global_ctrl >>= 32;
         for ( i = 0; i < 3; i++ )
         {
@@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp
         break;
     default:
         tmp = ecx - MSR_P6_EVNTSEL0;
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
         if ( tmp >= 0 && tmp < core2_get_pmc_count() )
             core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] =
                 (global_ctrl >> tmp) & (msr_content >> 22) & 1;
@@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp
     if ( type != MSR_TYPE_GLOBAL )
         wrmsrl(ecx, msr_content);
     else
-        vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+        vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
 
     return 1;
 }
@@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp
         msr_content = core2_vpmu_cxt->global_ovf_status;
         break;
     case MSR_CORE_PERF_GLOBAL_CTRL:
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
         break;
     default:
         rdmsrl(regs->ecx, msr_content);
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,7 +219,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = alloc_domain(DOMID_XEN);
+    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -227,7 +227,7 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = alloc_domain(DOMID_IO);
+    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
         {
             struct domain *d = page_get_owner(page);
 
-            /* Never allow a shadowed frame to go from type count 0 to 1 */
-            if ( d && shadow_mode_enabled(d) )
-                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+            /* Normally we should never let a page go from type count 0
+             * to type count 1 when it is shadowed. One exception:
+             * out-of-sync shadowed pages are allowed to become
+             * writeable. */
+            if ( d && shadow_mode_enabled(d)
+                 && (page->count_info & PGC_page_table)
+                 && !((page->shadow_flags & (1u<<29))
+                      && type == PGT_writable_page) )
+               shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
@@ -3533,15 +3539,14 @@ static int ptwr_emulated_read(
 static int ptwr_emulated_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned int rc;
     unsigned long addr = offset;
 
-    *val = 0;
-    if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
+    if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
     {
         propagate_page_fault(addr + bytes - rc, 0); /* read fault */
         return X86EMUL_EXCEPTION;
@@ -3568,7 +3573,7 @@ static int ptwr_emulated_update(
     /* Only allow naturally-aligned stores within the original %cr2 page. */
     if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
     {
-        MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
+        MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
                 ptwr_ctxt->cr2, addr, bytes);
         return X86EMUL_UNHANDLEABLE;
     }
@@ -3676,10 +3681,21 @@ static int ptwr_emulated_write(
 static int ptwr_emulated_write(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
+    paddr_t val = 0;
+
+    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
+    {
+        MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
+                offset, bytes);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    memcpy(&val, p_data, bytes);
+
     return ptwr_emulated_update(
         offset, 0, val, bytes, 0,
         container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
@@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg(
     struct x86_emulate_ctxt *ctxt)
 {
     paddr_t old = 0, new = 0;
-    if ( bytes > sizeof(paddr_t) )
+
+    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
+    {
+        MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
+                offset, bytes);
         return X86EMUL_UNHANDLEABLE;
+    }
+
     memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
+
     return ptwr_emulated_update(
         offset, old, new, bytes, 1,
         container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c   Wed Jul 02 11:30:37 2008 +0900
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty, 
                           shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    d->arch.paging.shadow.oos_active = 0;
+#endif
 }
 
 /* Setup the shadow-specific parts of a vcpu struct. Note: The most important
@@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d
  */
 void shadow_vcpu_init(struct vcpu *v)
 {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    int i;
+
+    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+    {
+        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+        v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
+    }
+#endif
+
     v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
 }
 
@@ -131,7 +145,7 @@ static int
 static int
 hvm_read(enum x86_segment seg,
          unsigned long offset,
-         unsigned long *val,
+         void *p_data,
          unsigned int bytes,
          enum hvm_access_type access_type,
          struct sh_emulate_ctxt *sh_ctxt)
@@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg,
     if ( rc )
         return rc;
 
-    *val = 0;
-
     if ( access_type == hvm_access_insn_fetch )
-        rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
+        rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
     else
-        rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
+        rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
 
     switch ( rc )
     {
@@ -167,20 +179,20 @@ static int
 static int
 hvm_emulate_read(enum x86_segment seg,
                  unsigned long offset,
-                 unsigned long *val,
+                 void *p_data,
                  unsigned int bytes,
                  struct x86_emulate_ctxt *ctxt)
 {
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
-    return hvm_read(seg, offset, val, bytes, hvm_access_read,
+    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
                     container_of(ctxt, struct sh_emulate_ctxt, ctxt));
 }
 
 static int
 hvm_emulate_insn_fetch(enum x86_segment seg,
                        unsigned long offset,
-                       unsigned long *val,
+                       void *p_data,
                        unsigned int bytes,
                        struct x86_emulate_ctxt *ctxt)
 {
@@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment 
 
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
-        return hvm_read(seg, offset, val, bytes,
+        return hvm_read(seg, offset, p_data, bytes,
                         hvm_access_insn_fetch, sh_ctxt);
 
     /* Hit the cache. Simple memcpy. */
-    *val = 0;
-    memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
+    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
     return X86EMUL_OKAY;
 }
 
 static int
 hvm_emulate_write(enum x86_segment seg,
                   unsigned long offset,
-                  unsigned long val,
+                  void *p_data,
                   unsigned int bytes,
                   struct x86_emulate_ctxt *ctxt)
 {
@@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg,
         return rc;
 
     return v->arch.paging.mode->shadow.x86_emulate_write(
-        v, addr, &val, bytes, sh_ctxt);
+        v, addr, p_data, bytes, sh_ctxt);
 }
 
 static int 
@@ -279,7 +290,7 @@ static int
 static int
 pv_emulate_read(enum x86_segment seg,
                 unsigned long offset,
-                unsigned long *val,
+                void *p_data,
                 unsigned int bytes,
                 struct x86_emulate_ctxt *ctxt)
 {
@@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg,
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
 
-    *val = 0;
-    if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
+    if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
     {
         propagate_page_fault(offset + bytes - rc, 0); /* read fault */
         return X86EMUL_EXCEPTION;
@@ -301,7 +311,7 @@ static int
 static int
 pv_emulate_write(enum x86_segment seg,
                  unsigned long offset,
-                 unsigned long val,
+                 void *p_data,
                  unsigned int bytes,
                  struct x86_emulate_ctxt *ctxt)
 {
@@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg,
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
     return v->arch.paging.mode->shadow.x86_emulate_write(
-        v, offset, &val, bytes, sh_ctxt);
+        v, offset, p_data, bytes, sh_ctxt);
 }
 
 static int 
@@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh
         }
     }
 }
+ 
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */ 
+
+/* From time to time, we let a shadowed pagetable page go out of sync 
+ * with its shadow: the guest is allowed to write directly to the page, 
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a 
+ * pagetable, but it relaxes a pretty important invariant in the shadow 
+ * pagetable design.  Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ *    at at higher level must be synchronously updated.  This makes
+ *    using linear shadow pagetables much less dangerous.
+ *    That means that: (a) unsyncing code needs to check for higher-level
+ *    shadows, and (b) promotion code needs to resync.
+ * 
+ * 2. All shadow operations on a guest page require the page to be brought
+ *    back into sync before proceeding.  This must be done under the
+ *    shadow lock so that the page is guaranteed to remain synced until
+ *    the operation completes.
+ *
+ *    Exceptions to this rule: the pagefault and invlpg handlers may 
+ *    update only one entry on an out-of-sync page without resyncing it. 
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ *    be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path 
+ *    #PF handler, INVLPG) must fall back to a locking, syncing version 
+ *    if they see an out-of-sync table. 
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ *    must explicitly resync all relevant pages or update their
+ *    shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...)  The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
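
A minimal sketch of the two-probe ("second chance") lookup the comment above describes; the real insert and remove paths are oos_hash_add() and oos_hash_remove() further down in this hunk, and SHADOW_OOS_PAGES is assumed to come from the shadow headers:

    /* Sketch only: per-vcpu open-addressed table, one overflow probe. */
    static int oos_hash_lookup(mfn_t *oos, mfn_t gmfn)
    {
        int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )    /* home slot holds another page */
            idx = (idx + 1) % SHADOW_OOS_PAGES;  /* try the second-chance slot   */

        return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
    }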
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d) 
+{
+    int idx, expected_idx, expected_idx_alt;
+    struct page_info *pg;
+    struct vcpu *v;
+    
+    for_each_vcpu(d, v) 
+    {
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            mfn_t *oos = v->arch.paging.shadow.oos;
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+            
+            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+            if ( idx != expected_idx && idx != expected_idx_alt )
+            {
+                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+                       __func__, idx, mfn_x(oos[idx]), 
+                       expected_idx, expected_idx_alt);
+                BUG();
+            }
+            pg = mfn_to_page(oos[idx]);
+            if ( !(pg->count_info & PGC_page_table) )
+            {
+                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
+                BUG();
+            }
+            if ( !(pg->shadow_flags & SHF_out_of_sync) )
+            {
+                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+            {
+                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+        }
+    }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 
+{
+    int idx;
+    struct vcpu *v;
+    mfn_t *oos;
+
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+            return;
+    }
+
+    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(page_is_out_of_sync(pg));
+
+    /* Call out to the appropriate per-mode resyncing function */
+    if ( pg->shadow_flags & SHF_L1_32 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
+#endif
+}
+
+#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i))
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
+                   mfn_t smfn, unsigned long off)
+{
+    int idx, i, free = 0, free_slot = 0;
+    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+    {
+        if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn)
+             || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) )
+        {
+            free = 1;
+            free_slot = _FIXUP_IDX(idx, i);
+        }
+        else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn))
+                  && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn))
+                  && (fixups[_FIXUP_IDX(idx, i)].off == off) )
+        {
+            perfc_incr(shadow_oos_fixup_no_add);
+            return;
+        }
+    }
+
+    if ( free )
+    {
+        if ( !v->arch.paging.shadow.oos_fixup_used )
+            v->arch.paging.shadow.oos_fixup_used = 1;
+        fixups[free_slot].gmfn = gmfn;
+        fixups[free_slot].smfn = smfn;
+        fixups[free_slot].off = off;
+        perfc_incr(shadow_oos_fixup_add_ok);
+        return;
+    }
+
+
+    perfc_incr(shadow_oos_fixup_add_fail);
+}
+
+void oos_fixup_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx, i;
+    struct domain *d = v->domain;
+
+    perfc_incr(shadow_oos_fixup_remove);
+
+    /* If the domain is dying we might get called when deallocating
+     * the shadows. Fixup tables are already freed so exit now. */
+    if ( d->is_dying )
+        return;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for_each_vcpu(d, v)
+    {
+        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) )
+                fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN);
+    }
+}
+
+int oos_fixup_flush(struct vcpu *v)
+{
+    int i, rc = 0;
+    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+    perfc_incr(shadow_oos_fixup_flush);
+
+    if ( !v->arch.paging.shadow.oos_fixup_used )
+        return 0;
+
+    for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+    {
+        if ( mfn_valid(fixups[i].gmfn) )
+        {
+            if ( mfn_is_out_of_sync(fixups[i].gmfn) )
+                rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn,
+                                                       fixups[i].smfn,
+                                                       fixups[i].off);
+            fixups[i].gmfn = _mfn(INVALID_MFN);
+        }
+    }
+
+    v->arch.paging.shadow.oos_fixup_used = 0;
+
+    return rc;
+}
+
+int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn)
+{
+    int idx, i, rc = 0;
+    struct domain *d = v->domain;
+
+    perfc_incr(shadow_oos_fixup_flush_gmfn);
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for_each_vcpu(d, v)
+    {
+        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+        {
+            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) )
+                continue;
+
+            rc |= sh_remove_write_access_from_sl1p(v, 
+                                                   fixups[_FIXUP_IDX(idx,i)].gmfn,
+                                                   fixups[_FIXUP_IDX(idx,i)].smfn,
+                                                   fixups[_FIXUP_IDX(idx,i)].off);
+
+            fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN);
+        }
+    }
+
+    return rc;
+}
+
+static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int ftlb = 0;
+
+    ftlb |= oos_fixup_flush_gmfn(v, gmfn);
+
+    switch ( sh_remove_write_access(v, gmfn, 0, va) )
+    {
+    default:
+    case 0:
+        break;
+
+    case 1:
+        ftlb |= 1;
+        break;
+
+    case -1:
+        /* An unfindable writeable typecount has appeared, probably via a
+         * grant table entry: can't shoot the mapping, so try to unshadow 
+         * the page.  If that doesn't work either, the guest is granting
+         * his pagetables and must be killed after all.
+         * This will flush the tlb, so we can return with no worries. */
+        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+        return 1;
+    }
+
+    if ( ftlb )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
+    return 0;
+}
+
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_locked_by_me(v->domain));
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    /* Guest page must be shadowed *only* as L1 when out of sync. */
+    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 
+             & ~SHF_L1_ANY));
+    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    /* Need to pull write access so the page *stays* in sync. */
+    if ( oos_remove_write_access(v, gmfn, va) )
+    {
+        /* Page has been unshadowed. */
+        return;
+    }
+
+    /* No more writable mappings of this page, please */
+    pg->shadow_flags &= ~SHF_oos_may_write;
+
+    /* Update the shadows with current guest entries. */
+    _sh_resync_l1(v, gmfn, snp);
+
+    /* Now we know all the entries are synced, and will stay that way */
+    pg->shadow_flags &= ~SHF_out_of_sync;
+    perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int idx, oidx, swap = 0;
+    void *gptr, *gsnpptr;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+    oidx = idx;
+
+    if ( mfn_valid(oos[idx]) 
+         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+    {
+        /* Punt the current occupant into the next slot */
+        SWAP(oos[idx], gmfn);
+        SWAP(oos_va[idx], va);
+        swap = 1;
+        idx = (idx + 1) % SHADOW_OOS_PAGES;
+    }
+    if ( mfn_valid(oos[idx]) )
+    {
+        /* Crush the current occupant. */
+        _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+        perfc_incr(shadow_unsync_evict);
+    }
+    oos[idx] = gmfn;
+    oos_va[idx] = va;
+
+    if ( swap )
+        SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
+
+    gptr = sh_map_domain_page(oos[oidx]);
+    gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
+    memcpy(gsnpptr, gptr, PAGE_SIZE);
+    sh_unmap_domain_page(gptr);
+    sh_unmap_domain_page(gsnpptr);
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    struct domain *d = v->domain;
+
+    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    mfn_t *oos_snapshot;
+    struct domain *d = v->domain;
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            return oos_snapshot[idx];
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+    return _mfn(INVALID_MFN);
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    unsigned long *oos_va;
+    mfn_t *oos_snapshot;
+    struct domain *d = v->domain;
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_va = v->arch.paging.shadow.oos_va;
+        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct page_info *pg = mfn_to_page(gl1mfn);
+    if ( pg->shadow_flags & SHF_L1_32 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 
+                 mfn_x(gl1mfn));
+    BUG();
+    return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected.  If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+    int idx;
+    struct vcpu *other;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+    ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+    if ( !this )
+        goto resync_others;
+
+    if ( do_locking )
+        shadow_lock(v->domain);
+
+    if ( oos_fixup_flush(v) )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);    
+
+    /* First: resync all of this vcpu's oos pages */
+    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        if ( mfn_valid(oos[idx]) )
+        {
+            /* Write-protect and sync contents */
+            _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+        }
+
+    if ( do_locking )
+        shadow_unlock(v->domain);
+
+ resync_others:
+    if ( !others )
+        return;
+
+    /* Second: make all *other* vcpus' oos pages safe. */
+    for_each_vcpu(v->domain, other)
+    {
+        if ( v == other ) 
+            continue;
+
+        if ( do_locking )
+            shadow_lock(v->domain);
+
+        oos = other->arch.paging.shadow.oos;
+        oos_va = other->arch.paging.shadow.oos_va;
+        oos_snapshot = other->arch.paging.shadow.oos_snapshot;
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        {
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            if ( skip )
+            {
+                /* Update the shadows and leave the page OOS. */
+                if ( sh_skip_sync(v, oos[idx]) )
+                    continue;
+                _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
+            }
+            else
+            {
+                /* Write-protect and sync contents */
+                _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]);
+                oos[idx] = _mfn(INVALID_MFN);
+            }
+        }
+        
+        if ( do_locking )
+            shadow_unlock(v->domain);
+    }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg;
+    
+    ASSERT(shadow_locked_by_me(v->domain));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    pg = mfn_to_page(gmfn);
+ 
+    /* Guest page must be shadowed *only* as L1 and *only* once when out
+     * of sync.  Also, get out now if it's already out of sync. 
+     * Also, can't safely unsync if some vcpus have paging disabled.*/
+    if ( pg->shadow_flags & 
+         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 
+         || sh_page_has_multiple_shadows(pg)
+         || !is_hvm_domain(v->domain)
+         || !v->domain->arch.paging.shadow.oos_active )
+        return 0;
+
+    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+    oos_hash_add(v, gmfn, va);
+    perfc_incr(shadow_unsync);
+    return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Code for "promoting" a guest page to the point where the shadow code is
@@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_
 
     ASSERT(mfn_valid(gmfn));
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Is the page already shadowed and out of sync? */
+    if ( page_is_out_of_sync(page) ) 
+        sh_resync(v, gmfn);
+#endif
+
     /* We should never try to promote a gmfn that has writeable mappings */
     ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
            || (page->u.inuse.type_info & PGT_count_mask) == 0
@@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t
     clear_bit(type, &page->shadow_flags);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+    {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Was the page out of sync? */
+        if ( page_is_out_of_sync(page) ) 
+        {
+            oos_hash_remove(v, gmfn);
+            oos_fixup_remove(v, gmfn);
+        }
+#endif 
         clear_bit(_PGC_page_table, &page->count_info);
+    }
 }
 
 /**************************************************************************/
@@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type)
         0, /* SH_type_l3_64_shadow   */
         0, /* SH_type_l4_64_shadow   */
         2, /* SH_type_p2m_table      */
-        0  /* SH_type_monitor_table  */
+        0, /* SH_type_monitor_table  */
+        0  /* SH_type_oos_snapshot   */
         };
     ASSERT(shadow_type < SH_type_unused);
     return type_to_order[shadow_type];
@@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st
             sp = list_entry(d->arch.paging.shadow.freelists[order].next,
                             struct shadow_page_info, list);
             list_del(&sp->list);
+#if defined(__x86_64__)
+            /*
+             * Re-instate lock field which we overwrite with shadow_page_info.
+             * This was safe, since the lock is only used on guest pages.
+             */
+            for ( j = 0; j < 1U << order; j++ )
+                spin_lock_init(&((struct page_info *)sp)[j].lock);
+#endif
             d->arch.paging.shadow.free_pages -= 1 << order;
             d->arch.paging.shadow.total_pages -= 1 << order;
             free_domheap_pages((struct page_info *)sp, order);
@@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct 
             /* Bad shadow flags on guest page? */
             BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
             /* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            if ( sp->type == SH_type_l1_32_shadow
+                 || sp->type == SH_type_l1_pae_shadow
+                 || sp->type == SH_type_l1_64_shadow )
+            {
+                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+                {
+                    if ( !page_is_out_of_sync(gpg) )
+                    {
+                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                                     " and not OOS but has typecount %#lx\n",
+                                     sp->backpointer, 
+                                     mfn_x(shadow_page_to_mfn(sp)), 
+                                     gpg->u.inuse.type_info);
+                        BUG();
+                    }
+                }
+            }
+            else /* Not an l1 */
+#endif
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
@@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m
 /* Remove all writeable mappings of a guest frame from the shadow tables 
  * Returns non-zero if we need to flush TLBs. 
  * level and fault_addr describe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
 
 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
                            unsigned int level,
@@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu *
         return 0;
 
     /* Early exit if it's already a pagetable, or otherwise not writeable */
-    if ( sh_mfn_is_a_page_table(gmfn) 
+    if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+           && !mfn_oos_may_write(gmfn)
+#endif
+         )
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
@@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu *
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
-    if ( v == current && level != 0 )
+    if ( v == current )
     {
         unsigned long gfn;
         /* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu *
                 return 1;                                                 \
         } while (0)
 
+        if ( level == 0 && fault_addr )
+            GUESS(fault_addr, 6);
         
         if ( v->arch.paging.mode->guest_levels == 2 )
         {
@@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu *
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
     
     /* Brute-force search of all the shadows, by walking the hash */
-    perfc_incr(shadow_writeable_bf);
+    if ( level == 0 )
+        perfc_incr(shadow_writeable_bf_1);
+    else
+        perfc_incr(shadow_writeable_bf);
     hash_foreach(v, callback_mask, callbacks, gmfn);
 
     /* If that didn't catch the mapping, then there's some non-pagetable
      * mapping -- ioreq page, grant mapping, &c. */
     if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
     {
+        if ( level == 0 )
+            return -1;
+
         SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
                       "%lu special-use mappings of it\n", mfn_x(gmfn),
                       (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu *
     return 1;
 }
 
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+                                     mfn_t smfn, unsigned long off)
+{
+    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    
+    ASSERT(mfn_valid(smfn));
+    ASSERT(mfn_valid(gmfn));
+    
+    if ( sp->type == SH_type_l1_32_shadow )
+    {
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
+            (v, gmfn, smfn, off);
+    }
+#if CONFIG_PAGING_LEVELS >= 3
+    else if ( sp->type == SH_type_l1_pae_shadow )
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
+            (v, gmfn, smfn, off);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( sp->type == SH_type_l1_64_shadow )
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
+            (v, gmfn, smfn, off);
+#endif
+#endif
+    
+    return 0;
+}
+#endif 
 
 /**************************************************************************/
 /* Remove all mappings of a guest frame from the shadow tables.
@@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc
     }
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    if ( v->arch.paging.shadow.oos_fixups == NULL )
+    {
+        int i;
+        v->arch.paging.shadow.oos_fixups =
+            alloc_xenheap_pages(SHADOW_OOS_FT_ORDER);
+        if ( v->arch.paging.shadow.oos_fixups == NULL )
+        {
+            SHADOW_ERROR("Could not allocate OOS fixup table"
+                         " for dom %u vcpu %u\n",
+                         v->domain->domain_id, v->vcpu_id);
+            domain_crash(v->domain);
+            return;
+        }
+        for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+            v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN);
+    }
+     
+    if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
+    {
+        int i;
+        for(i = 0; i < SHADOW_OOS_PAGES; i++)
+        {
+            shadow_prealloc(d, SH_type_oos_snapshot, 1);
+            v->arch.paging.shadow.oos_snapshot[i] =
+                shadow_alloc(d, SH_type_oos_snapshot, 0);
+        }
+    }
+#endif /* OOS */
+
     // Valid transitions handled by this function:
     // - For PV guests:
     //     - after a shadow mode has been changed
@@ -2158,6 +2864,13 @@ static void sh_update_paging_modes(struc
         ///
         ASSERT(shadow_mode_translate(d));
         ASSERT(shadow_mode_external(d));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Need to resync all our pages now, because if a page goes out
+         * of sync with paging enabled and is resynced with paging
+         * disabled, the resync will go wrong. */
+        shadow_resync_all(v, 0);
+#endif /* OOS */
 
         if ( !hvm_paging_enabled(v) )
         {
@@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc
         //        This *does* happen, at least for CR4.PGE...
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* We need to check that all the vcpus have paging enabled to
+     * unsync PTs. */
+    if ( is_hvm_domain(d) )
+    {
+        int pe = 1;
+        struct vcpu *vptr;
+
+        for_each_vcpu(d, vptr)
+        {
+            if ( !hvm_paging_enabled(vptr) )
+            {
+                pe = 0;
+                break;
+            }
+        }
+
+        d->arch.paging.shadow.oos_active = pe;
+    }
+#endif /* OOS */
+
     v->arch.paging.mode->update_cr3(v, 0);
 }
 
@@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d)
         }
     }
 
-#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) 
+#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
     /* Free the virtual-TLB array attached to each vcpu */
     for_each_vcpu(d, v)
     {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
         if ( v->arch.paging.vtlb )
         {
             xfree(v->arch.paging.vtlb);
             v->arch.paging.vtlb = NULL;
         }
-    }
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        if ( v->arch.paging.shadow.oos_fixups )
+        {
+            free_xenheap_pages(v->arch.paging.shadow.oos_fixups,
+                               SHADOW_OOS_FT_ORDER);
+            v->arch.paging.shadow.oos_fixups = NULL;
+        }
+
+        {
+            int i;
+            mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+            for(i = 0; i < SHADOW_OOS_PAGES; i++)
+                if ( mfn_valid(oos_snapshot[i]) )
+                    shadow_free(d, oos_snapshot[i]);
+        }
+#endif /* OOS */
+    }
+#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
 
     list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
     {
@@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v)
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
-    
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    sh_oos_audit(v->domain);
+#endif
+
     if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
         mask = ~1; /* Audit every table in the system */
     else 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c    Wed Jul 02 11:30:37 2008 +0900
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
 }
 
 /* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
+ * return an OR-ed result indicating whether a TLB flush is needed and
+ * whether the guest pages must be re-walked.
+ *
+ * Syncing a page removes write access to that page, but it may also
+ * give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
  */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK   2
+
 static inline uint32_t
 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
 {
-    int rc = 0;
+    uint32_t rc = 0;
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
-    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l3mfn) )
+    {
+        sh_resync(v, gw->l3mfn);
+        rc = GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+     if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+         rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l2mfn) )
+    {
+        sh_resync(v, gw->l2mfn);
+        rc |= GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+        rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
     if ( !(guest_supports_superpages(v) &&
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
-        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+        rc |= GW_RMWR_FLUSHTLB;
 
     return rc;
 }
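The GW_RMWR_* bits defined above are accumulated with |= and tested independently by the caller; sh_page_fault() later in this patch flushes TLBs for the first bit and restarts the guest walk for the second. A minimal standalone illustration of that calling pattern, with a stub in place of the real write-access/resync work:

/* Standalone illustration of the GW_RMWR_* bit protocol; the helper is a
 * stub, not the Xen function. */
#include <stdint.h>
#include <stdio.h>

#define GW_RMWR_FLUSHTLB 1
#define GW_RMWR_REWALK   2

static uint32_t remove_write_accesses(int resynced_something)
{
    uint32_t rc = 0;
    rc |= GW_RMWR_FLUSHTLB;        /* pretend we revoked a writable mapping */
    if ( resynced_something )
        rc |= GW_RMWR_REWALK;      /* a resync may have changed the walk    */
    return rc;
}

int main(void)
{
    int resynced;
    for ( resynced = 0; resynced <= 1; resynced++ )
    {
        uint32_t rc = remove_write_accesses(resynced);
        printf("resynced=%d:", resynced);
        if ( rc & GW_RMWR_FLUSHTLB )
            printf(" flush TLBs");
        if ( rc & GW_RMWR_REWALK )
            printf(" re-walk the guest tables");
        printf("\n");
    }
    return 0;
}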
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
     
     // protect guest page tables
     //
-    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    if ( unlikely((level == 1) 
+                  && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+                  && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+                  ) )
     {
         if ( shadow_mode_trap_reads(d) )
         {
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
     }
 
     /* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
              | (((unsigned long)sl3e) & ~PAGE_MASK));
     
     if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
         /* About to install a new reference */        
         if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
+    }
 
     /* Write the new entry */
     shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
              | (((unsigned long)sl2e) & ~PAGE_MASK));
 
     if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
+    {
+        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
         /* About to install a new reference */
-        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+        if ( !sh_get_ref(v, sl1mfn, paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        {
+            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->backpointer);
+
+            /* If the shadow is a fl1 then the backpointer contains
+               the GFN instead of the GMFN, and it's definitely not
+               OOS. */
+            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+                 && mfn_is_out_of_sync(gl1mfn) )
+                sh_resync(v, gl1mfn);
+        }
+#endif
+    }
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
@@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v
     int flags = 0;
     struct domain *d = v->domain;
     shadow_l1e_t old_sl1e;
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+    mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
+#endif
     ASSERT(sl1e != NULL);
     
     old_sl1e = *sl1e;
@@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v
                 /* Doesn't look like a pagetable. */
                 flags |= SHADOW_SET_ERROR;
                 new_sl1e = shadow_l1e_empty();
-            } else {
+            }
+            else
+            {
                 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+                if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
+                     && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
+                {
+                    oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
+                }
+#endif
+
             }
         }
     } 
@@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v,
     mfn_t gmfn;
     p2m_type_t p2mt;
     int result = 0;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    mfn_t gl1mfn;
+#endif /* OOS */
 
     perfc_incr(shadow_validate_gl1e_calls);
 
@@ -2539,10 +2617,138 @@ static int validate_gl1e(struct vcpu *v,
     gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
 
     l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
+    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+    if ( mfn_valid(gl1mfn) 
+         && mfn_is_out_of_sync(gl1mfn) )
+    {
+        /* Update the OOS snapshot. */
+        mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
+        guest_l1e_t *snp;
+
+        ASSERT(mfn_valid(snpmfn));
+
+        snp = sh_map_domain_page(snpmfn);
+        snp[guest_index(new_ge)] = new_gl1e;
+        sh_unmap_domain_page(snp);
+    }
+#endif /* OOS */
+
+    return result;
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows. 
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ *      *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
+{
+    mfn_t sl1mfn;
+    shadow_l1e_t *sl1p;
+    guest_l1e_t *gl1p, *gp, *snp;
+    int rc = 0;
+
+    ASSERT(mfn_valid(snpmfn));
+
+    sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+    snp = sh_map_domain_page(snpmfn);
+    gp = sh_map_domain_page(gl1mfn);
+    gl1p = gp;
+
+   SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+        guest_l1e_t gl1e = *gl1p;
+        guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
+
+        if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
+        {
+            gfn_t gfn;
+            mfn_t gmfn;
+            p2m_type_t p2mt;
+            shadow_l1e_t nsl1e;
+
+            gfn = guest_l1e_get_gfn(gl1e);
+            gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+            l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
+            rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
+            
+            *snpl1p = gl1e;
+        }
+    });
+
+    sh_unmap_domain_page(gp);
+    sh_unmap_domain_page(snp);
+
+    /* Setting shadow L1 entries should never need us to flush the TLB */
+    ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table. 
+ * That is: if we can tell that it's only used once, and that the 
+ * toplevel shadow responsible is not one of ours. 
+ * N.B. This function is called with the vcpu that required the resync, 
+ *      *not* the one that originally unsynced the page, but it is
+ *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct shadow_page_info *sp;
+    mfn_t smfn;
+
+    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
     
-    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
-    return result;
-}
+    /* Up to l2 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4) 
+    /* up to l3 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+    /* up to l4 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 
+         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+    /* In 2-on-3 shadow mode the up pointer contains the link to the
+     * shadow page, but the shadow_table contains only the first of the
+     * four pages that make up the PAE top shadow tables. */
+    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3) 
+         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 
+#endif
+        )
+        return 0;
+    
+    /* Only in use in one toplevel shadow, and it's not the one we're 
+     * running on */
+    return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
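The walk in sh_safe_not_to_sync() above relies on each shadow's reference count and its "up" back-pointer: if every shadow on the path from the L1 to the top level is referenced exactly once, and the resulting top-level shadow is not one the vcpu is currently running on, the L1 can safely be left unsynced. A toy model of that chain walk, with a simplified struct in place of shadow_page_info and none of the real mfn/up-pointer encoding:

/* Toy model of the single-parent chain walk; the struct and walk are
 * simplified stand-ins, not the real shadow_page_info/up encoding. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_shadow {
    int count;                   /* reference count                      */
    struct toy_shadow *up;       /* sole parent, only meaningful if 1    */
};

static bool safe_not_to_sync(struct toy_shadow *l1,
                             struct toy_shadow *current_top, int levels)
{
    struct toy_shadow *s = l1;
    int i;
    for ( i = 1; i < levels; i++ )        /* walk l1 -> l2 -> ... -> top */
    {
        if ( s->count != 1 || s->up == NULL )
            return false;                 /* referenced more than once   */
        s = s->up;
    }
    return s != current_top;              /* top not in use by this vcpu */
}

int main(void)
{
    struct toy_shadow l4 = { 1, NULL };
    struct toy_shadow l3 = { 1, &l4 };
    struct toy_shadow l2 = { 1, &l3 };
    struct toy_shadow l1 = { 1, &l2 };

    printf("another toplevel loaded: %d\n", safe_not_to_sync(&l1, NULL, 4));
    printf("this toplevel loaded:    %d\n", safe_not_to_sync(&l1, &l4, 4));
    return 0;
}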
 
 
 /**************************************************************************/
@@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v, 
     shadow_l1e_t sl1e;
     u32 gflags;
     p2m_type_t p2mt;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    guest_l1e_t *snpl1p = NULL;
+#endif /* OOS */
+
 
     /* Prefetch no further than the end of the _shadow_ l1 MFN */
     dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
@@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v, 
         /* Normal guest page; grab the next guest entry */
         gl1p = sh_map_domain_page(gw->l1mfn);
         gl1p += guest_l1_table_offset(gw->va);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        if ( mfn_is_out_of_sync(gw->l1mfn) )
+        {
+            mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
+
+            ASSERT(mfn_valid(snpmfn));
+            snpl1p = sh_map_domain_page(snpmfn);
+            snpl1p += guest_l1_table_offset(gw->va);
+        }
+#endif /* OOS */
     }
 
     for ( i = 1; i < dist ; i++ ) 
@@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v, 
         /* Propagate the entry.  */
         l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
         (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        if ( snpl1p != NULL )
+            snpl1p[i] = gl1e;
+#endif /* OOS */
     }
     if ( gl1p != NULL )
         sh_unmap_domain_page(gl1p);
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( snpl1p != NULL )
+        sh_unmap_domain_page(snpl1p);
+#endif /* OOS */
 }
 
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
@@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t rc;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v,
         {
             fast_emul = 1;
             gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+            /* Fall back to the slow path if we're trying to emulate
+               writes to an out of sync page. */
+            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+            {
+                v->arch.paging.last_write_emul_ok = 0;
+                goto page_fault_slow_path;
+            }
+#endif /* OOS */
+
             perfc_incr(shadow_fault_fast_emulate);
             goto early_emulation;
         }
@@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v,
                                       sizeof(sl1e)) == 0)
                     && sh_l1e_is_magic(sl1e)) )
         {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+             /* First, need to check that this isn't an out-of-sync
+              * shadow l1e.  If it is, we fall back to the slow path, which
+              * will sync it up again. */
+            {
+                shadow_l2e_t sl2e;
+                mfn_t gl1mfn;
+               if ( (__copy_from_user(&sl2e,
+                                       (sh_linear_l2_table(v)
+                                        + shadow_l2_linear_offset(va)),
+                                       sizeof(sl2e)) != 0)
+                     || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+                                      shadow_l2e_get_mfn(sl2e))->backpointer))
+                     || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+               {
+                   /* Hit the slow path as if there had been no 
+                    * shadow entry at all, and let it tidy up */
+                   ASSERT(regs->error_code & PFEC_page_present);
+                   regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                   goto page_fault_slow_path;
+               }
+            }
+#endif /* SHOPT_OUT_OF_SYNC */
+
             if ( sh_l1e_is_gnp(sl1e) )
             {
                 /* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v,
             return EXCRET_fault_fixed;
         }
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+ page_fault_slow_path:
+#endif
 #endif /* SHOPT_FAST_FAULT_PATH */
 
     /* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( !(rc & _PAGE_PRESENT) )
+        regs->error_code |= PFEC_page_present;
+    else if ( regs->error_code & PFEC_page_present )
+    {
+            SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+                         " flushing. Have fun debugging it.\n");
+            regs->error_code &= ~PFEC_page_present;
+    }
+#endif
+
+    if ( rc != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
-    if ( gw_remove_write_accesses(v, va, &gw) )
+    rc = gw_remove_write_accesses(v, va, &gw);
+
+    /* First bit set: Removed write access to a page. */
+    if ( rc & GW_RMWR_FLUSHTLB )
     {
         /* Write permission removal is also a hint that other gwalks
          * overlapping with this one may be inconsistent
@@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v,
         flush_tlb_mask(d->domain_dirty_cpumask);
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Second bit set: Resynced a page. Re-walk needed. */
+    if ( rc & GW_RMWR_REWALK )
+    {
+        shadow_unlock(d);
+        goto rewalk;
+    }
+#endif /* OOS */
+
     if ( !shadow_check_gwalk(v, va, &gw) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
-        return EXCRET_fault_fixed;
+        goto rewalk;
     }
 
     shadow_audit_tables(v);
@@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Always unsync when writing to L1 page tables. */
+    if ( sh_mfn_is_a_page_table(gmfn)
+         && ft == ft_demand_write )
+        sh_unsync(v, gmfn, va);
+#endif /* OOS */
+
     /* Calculate the shadow entry and write it */
     l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
     r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_valid(gw.l1mfn) 
+         && mfn_is_out_of_sync(gw.l1mfn) )
+    {
+        /* Update the OOS snapshot. */
+        mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
+        guest_l1e_t *snp;
+        
+        ASSERT(mfn_valid(snpmfn));
+        
+        snp = sh_map_domain_page(snpmfn);
+        snp[guest_l1_table_offset(va)] = gw.l1e;
+        sh_unmap_domain_page(snp);
+    }
+#endif /* OOS */
+
 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
     /* Prefetch some more shadow entries */
     sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
 #endif
 
     /* Need to emulate accesses to page tables */
-    if ( sh_mfn_is_a_page_table(gmfn) )
+    if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+         && !mfn_is_out_of_sync(gmfn)
+#endif
+         )
     {
         if ( ft == ft_demand_write )
         {
@@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long 
  * instruction should be issued on the hardware, or 0 if it's safe not
  * to do so. */
 {
+    mfn_t sl1mfn;
     shadow_l2e_t sl2e;
     
     perfc_incr(shadow_invlpg);
@@ -3278,12 +3604,64 @@ sh_invlpg(struct vcpu *v, unsigned long 
     // If so, then we'll need to flush the entire TLB (because that's
     // easier than invalidating all of the individual 4K pages).
     //
-    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+    sl1mfn = shadow_l2e_get_mfn(sl2e);
+    if ( mfn_to_shadow_page(sl1mfn)->type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
         return 0;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Check to see if the SL1 is out of sync. */
+    {
+        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        struct page_info *pg = mfn_to_page(gl1mfn);
+        if ( mfn_valid(gl1mfn) 
+             && page_is_out_of_sync(pg) )
+        {
+            /* The test above may give false positives, since we don't
+             * hold the shadow lock yet.  Check again with the lock held. */
+            shadow_lock(v->domain);
+
+            /* This must still be a copy-from-user because we didn't
+             * have the shadow lock last time we checked, and the
+             * higher-level shadows might have disappeared under our
+             * feet. */
+            if ( __copy_from_user(&sl2e, 
+                                  sh_linear_l2_table(v)
+                                  + shadow_l2_linear_offset(va),
+                                  sizeof (sl2e)) != 0 )
+            {
+                perfc_incr(shadow_invlpg_fault);
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+            {
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            sl1mfn = shadow_l2e_get_mfn(sl2e);
+            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            pg = mfn_to_page(gl1mfn);
+            
+            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+                        && page_is_out_of_sync(pg) ) )
+            {
+                shadow_l1e_t *sl1;
+                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+                /* Remove the shadow entry that maps this VA */
+                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+            }
+            shadow_unlock(v->domain);
+            /* Need the invlpg, to pick up the disappearance of the sl1e */
+            return 1;
+        }
+    }
+#endif
 
     return 1;
 }
@@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
         return;
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush.  Resync
+     * current vcpu's OOS pages before switching to the new shadow
+     * tables so that the VA hint is still valid.  */
+    shadow_resync_current_vcpu(v, do_locking);
+#endif
+
     if ( do_locking ) shadow_lock(v->domain);
 
     ASSERT(shadow_locked_by_me(v->domain));
@@ -3938,11 +4323,70 @@ sh_update_cr3(struct vcpu *v, int do_loc
 
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. We only
+     * update the shadows, leaving the pages out of sync. Also, we try
+     * to skip synchronization of shadows not mapped in the new
+     * tables. */
+    shadow_sync_other_vcpus(v, do_locking);
+#endif
+
 }
 
 
 /**************************************************************************/
 /* Functions to revoke guest rights */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 
+                                 mfn_t smfn, unsigned long off)
+{
+    int r;
+    shadow_l1e_t *sl1p, sl1e;
+    struct shadow_page_info *sp;
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(mfn_valid(smfn));
+
+    sp = mfn_to_shadow_page(smfn);
+
+    if ( sp->mbz != 0 ||
+#if GUEST_PAGING_LEVELS == 4
+         (sp->type != SH_type_l1_64_shadow)
+#elif GUEST_PAGING_LEVELS == 3
+         (sp->type != SH_type_l1_pae_shadow)
+#elif GUEST_PAGING_LEVELS == 2
+         (sp->type != SH_type_l1_32_shadow)
+#endif
+       )
+        goto fail;
+
+    sl1p = sh_map_domain_page(smfn);
+    sl1p += off;
+    sl1e = *sl1p;
+    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+          != (_PAGE_PRESENT|_PAGE_RW))
+         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+    {
+        sh_unmap_domain_page(sl1p);
+        goto fail;
+    }
+
+    /* Found it!  Need to remove its write permissions. */
+    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+    r = shadow_set_l1e(v, sl1p, sl1e, smfn);
+    ASSERT( !(r & SHADOW_SET_ERROR) );
+
+    sh_unmap_domain_page(sl1p);
+    perfc_incr(shadow_writeable_h_7);
+    return 1;
+
+ fail:
+    perfc_incr(shadow_writeable_h_8);
+    return 0;
+}
+#endif /* OOS */
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
@@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 
-#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
-    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
-           "gl" #_level "mfn = %" PRI_mfn                              \
-           " sl" #_level "mfn = %" PRI_mfn                             \
-           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
-           " gl" #_level "e = %" SH_PRI_gpte                              \
-           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
-           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
-           _level, guest_index(gl ## _level ## e),                         \
-           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
-           gl ## _level ## e, sl ## _level ## e,                           \
-           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
-           ##_a);                                                          \
-    BUG();                                                                 \
-    done = 1;                                                              \
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                            \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"       \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                 \
+           " gl" #_level "e = %" SH_PRI_gpte                            \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",      \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+               _level, guest_index(gl ## _level ## e),                  \
+               mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),  \
+               gl ## _level ## e, sl ## _level ## e,                    \
+           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+               ##_a);                                                   \
+        BUG();                                                          \
+        done = 1;                                                       \
 } while (0)
 
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                        \
+    printk("Shadow %u-on-%u audit failed at level %i\n"                 \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " Error: " _fmt "\n",                                        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+           _level,                                                      \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           ##_a);                                                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
 
 static char * sh_audit_flags(struct vcpu *v, int level,
                               int gflags, int sflags) 
@@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
     
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+    {
+        oos_audit_hash_is_present(v->domain, gl1mfn);
+        return 0;
+    }
+#endif
+
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 
@@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
     gl2e = gp = sh_map_domain_page(gl2mfn);
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
 
@@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
     gl3e = gp = sh_map_domain_page(gl3mfn);
     SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 
@@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
     gl4e = gp = sh_map_domain_page(gl4mfn);
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
     {
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.h    Wed Jul 02 11:30:37 2008 +0900
@@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
 
 extern struct paging_mode 
 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void 
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+     (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+     (struct vcpu*v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS)
+     (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+#endif
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/private.h  Wed Jul 02 11:30:37 2008 +0900
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
 #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
 #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
 #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
-
-#define SHADOW_OPTIMIZATIONS      0xff
+#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
+
+#define SHADOW_OPTIMIZATIONS     0x1ff
 
 
 /******************************************************************************
@@ -195,9 +196,9 @@ struct shadow_page_info
         u32 tlbflush_timestamp;
     };
     struct {
-        unsigned int type:4;      /* What kind of shadow is this? */
+        unsigned int type:5;      /* What kind of shadow is this? */
         unsigned int pinned:1;    /* Is the shadow pinned? */
-        unsigned int count:27;    /* Reference count */
+        unsigned int count:26;    /* Reference count */
         u32 mbz;                  /* Must be zero: this is where the owner 
                                    * field lives in a non-shadow page */
     } __attribute__((packed));
@@ -242,7 +243,8 @@ static inline void shadow_check_page_str
 #define SH_type_max_shadow    (13U)
 #define SH_type_p2m_table     (14U) /* in use as the p2m table */
 #define SH_type_monitor_table (15U) /* in use as a monitor table */
-#define SH_type_unused        (16U)
+#define SH_type_oos_snapshot  (16U) /* in use as OOS snapshot */
+#define SH_type_unused        (17U)
 
 /* 
  * What counts as a pinnable shadow?
@@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st
 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
 #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
 
+#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables.  If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous.  There is a short period of time 
+ * during resync that oos_may_write is clear but out_of_sync is not.  If a 
+ * codepath is called during that time and is sensitive to oos issues, it may 
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+
+/* Fixup tables are an incomplete reverse map of writable mappings for
+   OOS pages.  This lets us quickly resync pages (avoiding a brute-force
+   search of the shadows) when the va hint is not sufficient (i.e., when
+   the pagetable is mapped in multiple places and in multiple
+   shadows). */
+#define SHADOW_OOS_FT_ENTRIES                           \
+    ((PAGE_SIZE << SHADOW_OOS_FT_ORDER)                 \
+     / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup)))
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
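The SHADOW_OOS_FT_ENTRIES arithmetic is easier to follow with concrete numbers. SHADOW_OOS_FT_ORDER, SHADOW_OOS_FT_HASH and struct oos_fixup are defined elsewhere in the tree and are not visible in this hunk, so every constant in the sketch below is an assumed placeholder; the point is only that the (PAGE_SIZE << order) byte allocation is carved into "hash" buckets of "entries" fixup slots each.

/* Worked example of the fixup-table sizing; every constant here is an
 * assumed placeholder, not the value used by Xen. */
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE     4096u
#define OOS_FT_ORDER  2u          /* assumed: 4 pages, 16384 bytes        */
#define OOS_FT_HASH   64u         /* assumed number of hash buckets       */

struct oos_fixup {                /* assumed layout, for sizing only      */
    unsigned long gmfn;
    unsigned long smfn;
    unsigned long off;
};

#define OOS_FT_ENTRIES                          \
    ((PAGE_SIZE << OOS_FT_ORDER)                \
     / (OOS_FT_HASH * sizeof(struct oos_fixup)))

int main(void)
{
    printf("allocation:         %u bytes\n", PAGE_SIZE << OOS_FT_ORDER);
    printf("bytes per bucket:   %zu\n", OOS_FT_HASH * sizeof(struct oos_fixup));
    printf("entries per bucket: %zu\n", (size_t)OOS_FT_ENTRIES);
    /* With these assumed numbers on an LP64 machine:
     * 16384 / (64 * 24) = 10 fixup entries per hash bucket. */
    return 0;
}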
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+    u32 shadows;
+    if ( !(pg->count_info & PGC_page_table) )
+        return 0;
+    shadows = pg->shadow_flags & SHF_page_type_mask;
+    /* More than one type bit set in shadow-flags? */
+    return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
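sh_page_has_multiple_shadows() asks "is more than one shadow-type bit set?" by clearing the lowest set bit and seeing whether anything remains. The standalone check below shows this is equivalent to the classic x & (x - 1) test; a portable helper stands in for Xen's find_first_set_bit().

/* Demonstrates the "more than one bit set" test used above; a portable
 * helper stands in for Xen's find_first_set_bit(). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int first_set_bit(uint32_t x)   /* x must be nonzero */
{
    unsigned int i = 0;
    while ( !(x & 1u) ) { x >>= 1; i++; }
    return i;
}

static int multiple_bits_xen_style(uint32_t shadows)
{
    return (shadows & ~(1u << first_set_bit(shadows))) != 0;
}

static int multiple_bits_classic(uint32_t shadows)
{
    return (shadows & (shadows - 1)) != 0;
}

int main(void)
{
    uint32_t x;
    for ( x = 1; x < 4096; x++ )       /* zero means "no shadows" anyway */
        assert(multiple_bits_xen_style(x) == multiple_bits_classic(x));
    printf("both tests agree for all nonzero 12-bit masks\n");
    return 0;
}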
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn) 
+{
+    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn) 
+{
+    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
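To make the difference between the two predicates above concrete: sh_unsync() sets both flags, the first half of a resync (revoking write access) clears only oos_may_write, and the second half (revalidating the shadows) clears out_of_sync. The toy state machine below uses invented bit values; only the ordering is taken from the comments in this patch.

/* Toy model of the OOS flag lifecycle; the bit values are invented. */
#include <stdio.h>

#define OUT_OF_SYNC   0x1u
#define OOS_MAY_WRITE 0x2u

static void show(const char *stage, unsigned int flags)
{
    printf("%-24s out_of_sync=%d oos_may_write=%d\n", stage,
           !!(flags & OUT_OF_SYNC), !!(flags & OOS_MAY_WRITE));
}

int main(void)
{
    unsigned int flags = 0;

    show("in sync", flags);

    flags |= OUT_OF_SYNC | OOS_MAY_WRITE;    /* sh_unsync()                  */
    show("unsynced", flags);

    flags &= ~OOS_MAY_WRITE;                 /* resync: write access revoked;
                                              * the short window the comment
                                              * above describes              */
    show("resync, writes revoked", flags);

    flags &= ~OUT_OF_SYNC;                   /* resync: shadows revalidated  */
    show("back in sync", flags);
    return 0;
}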
 
 /******************************************************************************
  * Various function declarations 
@@ -351,7 +419,57 @@ int shadow_cmpxchg_guest_entry(struct vc
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
+
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+                                     mfn_t smfn, unsigned long offset);
+
+/* Pull all out-of-sync shadows back into sync.  If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */, 
+                  0 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  1 /* skip */, 
+                  0 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Flags used in the return value of the shadow_set_lXe() functions...
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/types.h    Wed Jul 02 11:30:37 2008 +0900
@@ -438,6 +438,11 @@ struct shadow_walk_t
 #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
 #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
 
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
+#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p)
+#endif
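These #defines follow the shadow code's usual multi-compilation pattern: multi.c is built once per guest paging depth, and INTERNAL_NAME() pastes the level into each symbol so the copies do not collide, while callers pick a variant via SHADOW_INTERNAL_NAME(name, levels). The sketch below shows the token-pasting idea with made-up macro definitions; it is not the actual Xen macro.

/* Illustrative token-pasting scheme; the real macros live in
 * shadow/types.h and private.h and differ in detail. */
#include <stdio.h>

#define PASTE_(name, levels) name##__guest_##levels
#define PASTE(name, levels)  PASTE_(name, levels)

#define GUEST_LEVELS 3                    /* pretend: -DGUEST_LEVELS=3 build */
#define INTERNAL_NAME(name) PASTE(name, GUEST_LEVELS)

/* "sh_resync_l1" in the source becomes sh_resync_l1__guest_3 in the object,
 * so the 2-, 3- and 4-level builds of the same file can be linked together. */
static void INTERNAL_NAME(sh_resync_l1)(void)
{
    printf("resync variant for %d guest levels\n", GUEST_LEVELS);
}

int main(void)
{
    INTERNAL_NAME(sh_resync_l1)();
    return 0;
}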
 
 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
 #define sh_guest_map_l1e \
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
                 cpu_count++;
             }
             if ( cpu_count == num_online_cpus() )
-                ret = acpi_cpufreq_init();
+            {
+                if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+                    ret = powernow_cpufreq_init();
+                else
+                    ret = acpi_cpufreq_init();
+            }
             break;
         }
  
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.c
--- a/xen/arch/x86/x86_emulate/x86_emulate.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c    Wed Jul 02 11:30:37 2008 +0900
@@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = {
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xD7 */
-    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
-    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD8 - 0xDF */
-    0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov,
-    0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
     /* 0xE0 - 0xE7 */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = {
     ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
     /* 0xA0 - 0xA7 */
     ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, 
+    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
     /* 0xA8 - 0xAF */
     ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
     DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM,
@@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = {
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
     enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
-    unsigned int  bytes;
-    unsigned long val, orig_val;
+    unsigned int bytes;
+
+    /* Up to 128-bit operand value, addressable as ulong or uint32_t[]. */
+    union {
+        unsigned long val;
+        uint32_t bigval[4];
+    };
+
+    /* Up to 128-bit operand value, addressable as ulong or uint32_t[]. */
+    union {
+        unsigned long orig_val;
+        uint32_t orig_bigval[4];
+    };
+
     union {
         /* OP_REG: Pointer to register field. */
         unsigned long *reg;
@@ -466,7 +480,7 @@ do{ asm volatile (                      
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch_bytes(_size)                                         \
-({ unsigned long _x, _eip = _regs.eip;                                  \
+({ unsigned long _x = 0, _eip = _regs.eip;                              \
    if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
    _regs.eip += (_size); /* real hardware doesn't truncate */           \
    generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15,   \
@@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic;           
     put_fpu(&fic);                                      \
 } while (0)
 
+#define emulate_fpu_insn_memsrc(_op, _arg)              \
+do{ struct fpu_insn_ctxt fic;                           \
+    get_fpu(X86EMUL_FPU_fpu, &fic);                     \
+    asm volatile (                                      \
+        "movb $2f-1f,%0 \n"                             \
+        "1: " _op " %1  \n"                             \
+        "2:             \n"                             \
+        : "=m" (fic.insn_bytes)                         \
+        : "m" (_arg) : "memory" );                      \
+    put_fpu(&fic);                                      \
+} while (0)
+
 #define emulate_fpu_insn_stub(_bytes...)                                \
 do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
     struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
@@ -654,6 +680,19 @@ static void __put_rep_prefix(
     if ( rep_prefix )                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
 })
+
+/* Compatibility function: read guest memory, zero-extend result to a ulong. */
+static int read_ulong(
+        enum x86_segment seg,
+        unsigned long offset,
+        unsigned long *val,
+        unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt,
+        struct x86_emulate_ops *ops)
+{
+    *val = 0;
+    return ops->read(seg, offset, val, bytes, ctxt);
+}
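read_ulong() exists because, with the operand unions introduced earlier in this file, ops->read() now fills a buffer that can be wider than the number of bytes actually fetched; a 2-byte read into an uninitialised unsigned long would leave the upper bytes as garbage. A standalone illustration of that hazard, with a dummy callback standing in for the real ops->read() interface:

/* Standalone illustration of why the destination is zeroed first; fake_read
 * is a simplified stand-in for ops->read(), not the real interface. */
#include <stdio.h>
#include <string.h>

static int fake_read(void *buf, unsigned int bytes)
{
    unsigned char guest[2] = { 0x34, 0x12 };    /* little-endian 0x1234 */
    memcpy(buf, guest, bytes);
    return 0;
}

static int read_ulong_demo(unsigned long *val, unsigned int bytes)
{
    *val = 0;                    /* zero-extend: clear the whole word first */
    return fake_read(val, bytes);
}

int main(void)
{
    unsigned long v = (unsigned long)-1;        /* stale contents: all ones */

    fake_read(&v, 2);            /* partial overwrite leaves upper bytes stale */
    printf("without zeroing: %#lx\n", v);

    read_ulong_demo(&v, 2);
    printf("with zeroing:    %#lx\n", v);       /* 0x1234 on little-endian */
    return 0;
}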
 
 /*
  * Unsigned multiplication with double-word result.
@@ -841,7 +880,8 @@ static int ioport_access_check(
          (tr.limit < 0x67) )
         goto raise_exception;
 
-    if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) )
+    if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66,
+                          &iobmp, 2, ctxt, ops)) )
         return rc;
 
     /* Ensure TSS includes two bytes including byte containing first port. */
@@ -849,7 +889,8 @@ static int ioport_access_check(
     if ( tr.limit <= iobmp )
         goto raise_exception;
 
-    if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) )
+    if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp,
+                          &iobmp, 2, ctxt, ops)) )
         return rc;
     if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 )
         goto raise_exception;
@@ -941,12 +982,12 @@ protmode_load_seg(
         goto raise_exn;
 
     do {
-        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8),
-                             &val, 4, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8),
+                              &val, 4, ctxt, ops)) )
             return rc;
         desc.a = val;
-        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
-                             &val, 4, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
+                              &val, 4, ctxt, ops)) )
             return rc;
         desc.b = val;
 
@@ -992,14 +1033,15 @@ protmode_load_seg(
             if ( (desc.b & (5u<<9)) == (4u<<9) )
                 goto raise_exn;
             /* Non-conforming segment: check DPL against RPL and CPL. */
-            if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) )
+            if ( ((desc.b & (6u<<9)) != (6u<<9)) &&
+                 ((dpl < cpl) || (dpl < rpl)) )
                 goto raise_exn;
             break;
         }
 
         /* Ensure Accessed flag is set. */
         new_desc_b = desc.b | 0x100;
-        rc = ((desc.b & 0x100) ? X86EMUL_OKAY : 
+        rc = ((desc.b & 0x100) ? X86EMUL_OKAY :
               ops->cmpxchg(
                   x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
                   &desc.b, &new_desc_b, 4, ctxt));
@@ -1061,16 +1103,16 @@ decode_register(
     case  2: p = &regs->edx; break;
     case  3: p = &regs->ebx; break;
     case  4: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->eax + 1) : 
+                  ((unsigned char *)&regs->eax + 1) :
                   (unsigned char *)&regs->esp); break;
     case  5: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->ecx + 1) : 
+                  ((unsigned char *)&regs->ecx + 1) :
                   (unsigned char *)&regs->ebp); break;
     case  6: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->edx + 1) : 
+                  ((unsigned char *)&regs->edx + 1) :
                   (unsigned char *)&regs->esi); break;
     case  7: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->ebx + 1) : 
+                  ((unsigned char *)&regs->ebx + 1) :
                   (unsigned char *)&regs->edi); break;
 #if defined(__x86_64__)
     case  8: p = &regs->r8;  break;
@@ -1402,8 +1444,8 @@ x86_emulate(
             case 8: src.val = *(uint64_t *)src.reg; break;
             }
         }
-        else if ( (rc = ops->read(src.mem.seg, src.mem.off,
-                                  &src.val, src.bytes, ctxt)) )
+        else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+                                   &src.val, src.bytes, ctxt, ops)) )
             goto done;
         break;
     case SrcImm:
@@ -1494,8 +1536,8 @@ x86_emulate(
         }
         else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
         {
-            if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                 &dst.val, dst.bytes, ctxt)) )
+            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                  &dst.val, dst.bytes, ctxt, ops)) )
                 goto done;
             dst.orig_val = dst.val;
         }
@@ -1571,8 +1613,8 @@ x86_emulate(
         int lb, ub, idx;
         generate_exception_if(mode_64bit() || (src.type != OP_MEM),
                               EXC_UD, -1);
-        if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes,
-                             &src_val2, op_bytes, ctxt)) )
+        if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
+                              &src_val2, op_bytes, ctxt, ops)) )
             goto done;
         ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
         lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
@@ -1588,8 +1630,8 @@ x86_emulate(
             /* movsxd */
             if ( src.type == OP_REG )
                 src.val = *(int32_t *)src.reg;
-            else if ( (rc = ops->read(src.mem.seg, src.mem.off,
-                                      &src.val, 4, ctxt)) )
+            else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+                                       &src.val, 4, ctxt, ops)) )
                 goto done;
             dst.val = (int32_t)src.val;
         }
@@ -1613,8 +1655,8 @@ x86_emulate(
         unsigned long src1; /* ModR/M source operand */
         if ( ea.type == OP_REG )
             src1 = *ea.reg;
-        else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
-                                  &src1, op_bytes, ctxt)) )
+        else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+                                   &src1, op_bytes, ctxt, ops)) )
             goto done;
         _regs.eflags &= ~(EFLG_OF|EFLG_CF);
         switch ( dst.bytes )
@@ -1720,8 +1762,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (dst.bytes == 4) )
             dst.bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -1773,8 +1815,8 @@ x86_emulate(
         dst.val = x86_seg_es;
     les: /* dst.val identifies the segment */
         generate_exception_if(src.type != OP_MEM, EXC_UD, -1);
-        if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes,
-                             &sel, 2, ctxt)) != 0 )
+        if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes,
+                              &sel, 2, ctxt, ops)) != 0 )
             goto done;
         if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 )
             goto done;
@@ -2020,8 +2062,8 @@ x86_emulate(
                 dst.bytes = op_bytes = 8;
                 if ( dst.type == OP_REG )
                     dst.val = *dst.reg;
-                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                          &dst.val, 8, ctxt)) != 0 )
+                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                           &dst.val, 8, ctxt, ops)) != 0 )
                     goto done;
             }
             src.val = _regs.eip;
@@ -2036,8 +2078,8 @@ x86_emulate(
 
             generate_exception_if(dst.type != OP_MEM, EXC_UD, -1);
 
-            if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes,
-                                 &sel, 2, ctxt)) )
+            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes,
+                                  &sel, 2, ctxt, ops)) )
                 goto done;
 
             if ( (modrm_reg & 7) == 3 ) /* call */
@@ -2046,9 +2088,9 @@ x86_emulate(
                 fail_if(ops->read_segment == NULL);
                 if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                      reg.sel, op_bytes, ctxt)) ||
+                                      &reg.sel, op_bytes, ctxt)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                      _regs.eip, op_bytes, ctxt)) )
+                                      &_regs.eip, op_bytes, ctxt)) )
                     goto done;
             }
 
@@ -2066,12 +2108,12 @@ x86_emulate(
                 dst.bytes = 8;
                 if ( dst.type == OP_REG )
                     dst.val = *dst.reg;
-                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                          &dst.val, 8, ctxt)) != 0 )
+                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                           &dst.val, 8, ctxt, ops)) != 0 )
                     goto done;
             }
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                  dst.val, dst.bytes, ctxt)) != 0 )
+                                  &dst.val, dst.bytes, ctxt)) != 0 )
                 goto done;
             dst.type = OP_NONE;
             break;
@@ -2106,7 +2148,7 @@ x86_emulate(
                 &dst.val, dst.bytes, ctxt);
         else
             rc = ops->write(
-                dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt);
+                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
         if ( rc != 0 )
             goto done;
     default:
@@ -2153,7 +2195,7 @@ x86_emulate(
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
         if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              reg.sel, op_bytes, ctxt)) != 0 )
+                              &reg.sel, op_bytes, ctxt)) != 0 )
             goto done;
         break;
     }
@@ -2165,8 +2207,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 )
             return rc;
@@ -2275,8 +2317,8 @@ x86_emulate(
         dst.bytes = op_bytes;
         if ( mode_64bit() && (dst.bytes == 4) )
             dst.bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -2288,7 +2330,7 @@ x86_emulate(
         generate_exception_if(mode_64bit(), EXC_UD, -1);
         for ( i = 0; i < 8; i++ )
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                  regs[i], op_bytes, ctxt)) != 0 )
+                                  &regs[i], op_bytes, ctxt)) != 0 )
             goto done;
         break;
     }
@@ -2303,8 +2345,8 @@ x86_emulate(
         generate_exception_if(mode_64bit(), EXC_UD, -1);
         for ( i = 0; i < 8; i++ )
         {
-            if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                                 &dst.val, op_bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                                  &dst.val, op_bytes, ctxt, ops)) != 0 )
                 goto done;
             switch ( op_bytes )
             {
@@ -2382,8 +2424,8 @@ x86_emulate(
         }
         else
         {
-            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                                 &dst.val, dst.bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
                 goto done;
             fail_if(ops->write_io == NULL);
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
@@ -2455,9 +2497,9 @@ x86_emulate(
 
         if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              reg.sel, op_bytes, ctxt)) ||
+                              &reg.sel, op_bytes, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              _regs.eip, op_bytes, ctxt)) )
+                              &_regs.eip, op_bytes, ctxt)) )
             goto done;
 
         if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 )
@@ -2483,8 +2525,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         if ( op_bytes == 2 )
             dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u);
@@ -2507,8 +2549,8 @@ x86_emulate(
         dst.type  = OP_REG;
         dst.reg   = (unsigned long *)&_regs.eax;
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -2536,8 +2578,8 @@ x86_emulate(
         }
         else
         {
-            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                                 &dst.val, dst.bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
                 goto done;
             dst.type = OP_MEM;
             nr_reps = 1;
@@ -2556,10 +2598,10 @@ x86_emulate(
         unsigned long next_eip = _regs.eip;
         get_rep_prefix();
         src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                             &dst.val, dst.bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
-                             &src.val, src.bytes, ctxt)) )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                              &dst.val, dst.bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+                              &src.val, src.bytes, ctxt, ops)) )
             goto done;
         register_address_increment(
             _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2592,8 +2634,8 @@ x86_emulate(
         dst.type  = OP_REG;
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst.reg   = (unsigned long *)&_regs.eax;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         register_address_increment(
             _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2606,8 +2648,8 @@ x86_emulate(
         get_rep_prefix();
         src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst.val = _regs.eax;
-        if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
-                             &src.val, src.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+                              &src.val, src.bytes, ctxt, ops)) != 0 )
             goto done;
         register_address_increment(
             _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes);
@@ -2624,8 +2666,8 @@ x86_emulate(
     case 0xc3: /* ret (near) */ {
         int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
         op_bytes = mode_64bit() ? 8 : op_bytes;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         _regs.eip = dst.val;
         break;
@@ -2640,7 +2682,7 @@ x86_emulate(
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.ebp;
         if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                              _regs.ebp, dst.bytes, ctxt)) )
+                              &_regs.ebp, dst.bytes, ctxt)) )
             goto done;
         dst.val = _regs.esp;
 
@@ -2650,14 +2692,14 @@ x86_emulate(
             {
                 unsigned long ebp, temp_data;
                 ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8);
-                if ( (rc = ops->read(x86_seg_ss, ebp,
-                                     &temp_data, dst.bytes, ctxt)) ||
+                if ( (rc = read_ulong(x86_seg_ss, ebp,
+                                      &temp_data, dst.bytes, ctxt, ops)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                      temp_data, dst.bytes, ctxt)) )
+                                      &temp_data, dst.bytes, ctxt)) )
                     goto done;
             }
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                  dst.val, dst.bytes, ctxt)) )
+                                  &dst.val, dst.bytes, ctxt)) )
                 goto done;
         }
 
@@ -2683,8 +2725,8 @@ x86_emulate(
 
         /* Second writeback, to %%ebp. */
         dst.reg = (unsigned long *)&_regs.ebp;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) )
             goto done;
         break;
 
@@ -2692,10 +2734,10 @@ x86_emulate(
     case 0xcb: /* ret (far) */ {
         int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0;
         op_bytes = mode_64bit() ? 8 : op_bytes;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) || 
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
-                             &src.val, op_bytes, ctxt)) ||
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+                              &src.val, op_bytes, ctxt, ops)) ||
              (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) )
             goto done;
         _regs.eip = dst.val;
@@ -2729,12 +2771,12 @@ x86_emulate(
         if ( !mode_iopl() )
             mask |= EFLG_IF;
         fail_if(!in_realmode(ctxt, ops));
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &eip, op_bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &cs, op_bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &eflags, op_bytes, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &eip, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &cs, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &eflags, op_bytes, ctxt, ops)) )
             goto done;
         if ( op_bytes == 2 )
             eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u);
@@ -2779,12 +2821,64 @@ x86_emulate(
 
     case 0xd7: /* xlat */ {
         unsigned long al = (uint8_t)_regs.eax;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al),
-                             &al, 1, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al),
+                              &al, 1, ctxt, ops)) != 0 )
             goto done;
         *(uint8_t *)&_regs.eax = al;
         break;
     }
+
+    case 0xd8: /* FPU 0xd8 */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fadd %stN,%stN */
+        case 0xc8 ... 0xcf: /* fmul %stN,%stN */
+        case 0xd0 ... 0xd7: /* fcom %stN,%stN */
+        case 0xd8 ... 0xdf: /* fcomp %stN,%stN */
+        case 0xe0 ... 0xe7: /* fsub %stN,%stN */
+        case 0xe8 ... 0xef: /* fsubr %stN,%stN */
+        case 0xf0 ... 0xf7: /* fdiv %stN,%stN */
+        case 0xf8 ... 0xff: /* fdivr %stN,%stN */
+            emulate_fpu_insn_stub(0xd8, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 4;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fadd */
+                emulate_fpu_insn_memsrc("fadds", src.val);
+                break;
+            case 1: /* fmul */
+                emulate_fpu_insn_memsrc("fmuls", src.val);
+                break;
+            case 2: /* fcom */
+                emulate_fpu_insn_memsrc("fcoms", src.val);
+                break;
+            case 3: /* fcomp */
+                emulate_fpu_insn_memsrc("fcomps", src.val);
+                break;
+            case 4: /* fsub */
+                emulate_fpu_insn_memsrc("fsubs", src.val);
+                break;
+            case 5: /* fsubr */
+                emulate_fpu_insn_memsrc("fsubrs", src.val);
+                break;
+            case 6: /* fdiv */
+                emulate_fpu_insn_memsrc("fdivs", src.val);
+                break;
+            case 7: /* fdivr */
+                emulate_fpu_insn_memsrc("fdivrs", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
 
     case 0xd9: /* FPU 0xd9 */
         switch ( modrm )
@@ -2822,28 +2916,269 @@ x86_emulate(
             emulate_fpu_insn_stub(0xd9, modrm);
             break;
         default:
-            fail_if((modrm_reg & 7) != 7);
             fail_if(modrm >= 0xc0);
-            /* fnstcw m2byte */
-            ea.bytes = 2;
-            dst = ea;
-            emulate_fpu_insn_memdst("fnstcw", dst.val);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fld m32fp */
+                ea.bytes = 4;
+                src = ea;
+                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("flds", src.val);
+                break;
+            case 2: /* fst m32fp */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fsts", dst.val);
+                break;
+            case 3: /* fstp m32fp */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstps", dst.val);
+                break;
+                /* case 4: fldenv - TODO */
+            case 5: /* fldcw m2byte */
+                ea.bytes = 2;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fldcw", src.val);
+                break;
+                /* case 6: fstenv - TODO */
+            case 7: /* fnstcw m2byte */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fnstcw", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
+
+    case 0xda: /* FPU 0xda */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fcmovb %stN */
+        case 0xc8 ... 0xcf: /* fcmove %stN */
+        case 0xd0 ... 0xd7: /* fcmovbe %stN */
+        case 0xd8 ... 0xdf: /* fcmovu %stN */
+        case 0xe9:          /* fucompp */
+            emulate_fpu_insn_stub(0xda, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 8;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fiadd m64i */
+                emulate_fpu_insn_memsrc("fiaddl", src.val);
+                break;
+            case 1: /* fimul m64i */
+                emulate_fpu_insn_memsrc("fimul", src.val);
+                break;
+            case 2: /* ficom m64i */
+                emulate_fpu_insn_memsrc("ficoml", src.val);
+                break;
+            case 3: /* ficomp m64i */
+                emulate_fpu_insn_memsrc("ficompl", src.val);
+                break;
+            case 4: /* fisub m64i */
+                emulate_fpu_insn_memsrc("fisubl", src.val);
+                break;
+            case 5: /* fisubr m64i */
+                emulate_fpu_insn_memsrc("fisubrl", src.val);
+                break;
+            case 6: /* fidiv m64i */
+                emulate_fpu_insn_memsrc("fidivl", src.val);
+                break;
+            case 7: /* fidivr m64i */
+                emulate_fpu_insn_memsrc("fidivrl", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
         }
         break;
 
     case 0xdb: /* FPU 0xdb */
-        fail_if(modrm != 0xe3);
-        /* fninit */
-        emulate_fpu_insn("fninit");
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fcmovnb %stN */
+        case 0xc8 ... 0xcf: /* fcmovne %stN */
+        case 0xd0 ... 0xd7: /* fcmovnbe %stN */
+        case 0xd8 ... 0xdf: /* fcmovnu %stN */
+            emulate_fpu_insn_stub(0xdb, modrm);
+            break;
+        case 0xe2: /* fnclex */
+            emulate_fpu_insn("fnclex");
+            break;
+        case 0xe3: /* fninit */
+            emulate_fpu_insn("fninit");
+            break;
+        case 0xe4: /* fsetpm - 287 only, ignored by 387 */
+            break;
+        case 0xe8 ... 0xef: /* fucomi %stN */
+        case 0xf0 ... 0xf7: /* fcomi %stN */
+            emulate_fpu_insn_stub(0xdb, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fild m32i */
+                ea.bytes = 4;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fildl", src.val);
+                break;
+            case 1: /* fisttp m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttpl", dst.val);
+                break;
+            case 2: /* fist m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistl", dst.val);
+                break;
+            case 3: /* fistp m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistpl", dst.val);
+                break;
+            case 5: /* fld m80fp */
+                ea.bytes = 10;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                     &src.val, src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memdst("fldt", src.val);
+                break;
+            case 7: /* fstp m80fp */
+                ea.bytes = 10;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstpt", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
+
+    case 0xdc: /* FPU 0xdc */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fadd %stN */
+        case 0xc8 ... 0xcf: /* fmul %stN */
+        case 0xe0 ... 0xe7: /* fsubr %stN */
+        case 0xe8 ... 0xef: /* fsub %stN */
+        case 0xf0 ... 0xf7: /* fdivr %stN */
+        case 0xf8 ... 0xff: /* fdiv %stN */
+            emulate_fpu_insn_stub(0xdc, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 8;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fadd m64fp */
+                emulate_fpu_insn_memsrc("faddl", src.val);
+                break;
+            case 1: /* fmul m64fp */
+                emulate_fpu_insn_memsrc("fmull", src.val);
+                break;
+            case 2: /* fcom m64fp */
+                emulate_fpu_insn_memsrc("fcoml", src.val);
+                break;
+            case 3: /* fcomp m64fp */
+                emulate_fpu_insn_memsrc("fcompl", src.val);
+                break;
+            case 4: /* fsub m64fp */
+                emulate_fpu_insn_memsrc("fsubl", src.val);
+                break;
+            case 5: /* fsubr m64fp */
+                emulate_fpu_insn_memsrc("fsubrl", src.val);
+                break;
+            case 6: /* fdiv m64fp */
+                emulate_fpu_insn_memsrc("fdivl", src.val);
+                break;
+            case 7: /* fdivr m64fp */
+                emulate_fpu_insn_memsrc("fdivrl", src.val);
+                break;
+            }
+        }
         break;
 
     case 0xdd: /* FPU 0xdd */
-        fail_if((modrm_reg & 7) != 7);
-        fail_if(modrm >= 0xc0);
-        /* fnstsw m2byte */
-        ea.bytes = 2;
-        dst = ea;
-        emulate_fpu_insn_memdst("fnstsw", dst.val);
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* ffree %stN */
+        case 0xd0 ... 0xd7: /* fst %stN */
+        case 0xd8 ... 0xdf: /* fstp %stN */
+        case 0xe0 ... 0xe7: /* fucom %stN */
+        case 0xe8 ... 0xef: /* fucomp %stN */
+            emulate_fpu_insn_stub(0xdd, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fld m64fp */
+                ea.bytes = 8;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fldl", src.val);
+                break;
+            case 1: /* fisttp m64i */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttpll", dst.val);
+                break;
+            case 2: /* fst m64fp */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstl", dst.val);
+                break;
+            case 3: /* fstp m64fp */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstpl", dst.val);
+                break;
+            case 7: /* fnstsw m2byte */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fnstsw", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
         break;
 
     case 0xde: /* FPU 0xde */
@@ -2859,17 +3194,120 @@ x86_emulate(
             emulate_fpu_insn_stub(0xde, modrm);
             break;
         default:
-            goto cannot_emulate;
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 2;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fiadd m16i */
+                emulate_fpu_insn_memsrc("fiadd", src.val);
+                break;
+            case 1: /* fimul m16i */
+                emulate_fpu_insn_memsrc("fimul", src.val);
+                break;
+            case 2: /* ficom m16i */
+                emulate_fpu_insn_memsrc("ficom", src.val);
+                break;
+            case 3: /* ficomp m16i */
+                emulate_fpu_insn_memsrc("ficomp", src.val);
+                break;
+            case 4: /* fisub m16i */
+                emulate_fpu_insn_memsrc("fisub", src.val);
+                break;
+            case 5: /* fisubr m16i */
+                emulate_fpu_insn_memsrc("fisubr", src.val);
+                break;
+            case 6: /* fidiv m16i */
+                emulate_fpu_insn_memsrc("fidiv", src.val);
+                break;
+            case 7: /* fidivr m16i */
+                emulate_fpu_insn_memsrc("fidivr", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
         }
         break;
 
     case 0xdf: /* FPU 0xdf */
-        fail_if(modrm != 0xe0);
-        /* fnstsw %ax */
-        dst.bytes = 2;
-        dst.type = OP_REG;
-        dst.reg = (unsigned long *)&_regs.eax;
-        emulate_fpu_insn_memdst("fnstsw", dst.val);
+        switch ( modrm )
+        {
+        case 0xe0:
+            /* fnstsw %ax */
+            dst.bytes = 2;
+            dst.type = OP_REG;
+            dst.reg = (unsigned long *)&_regs.eax;
+            emulate_fpu_insn_memdst("fnstsw", dst.val);
+            break;
+        case 0xf0 ... 0xf7: /* fcomip %stN */
+        case 0xf8 ... 0xff: /* fucomip %stN */
+            emulate_fpu_insn_stub(0xdf, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fild m16i */
+                ea.bytes = 2;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fild", src.val);
+                break;
+            case 1: /* fisttp m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttp", dst.val);
+                break;
+            case 2: /* fist m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fist", dst.val);
+                break;
+            case 3: /* fistp m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistp", dst.val);
+                break;
+            case 4: /* fbld m80dec */
+                ea.bytes = 10;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                     &src.val, src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memdst("fbld", src.val);
+                break;
+            case 5: /* fild m64i */
+                ea.bytes = 8;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fildll", src.val);
+                break;
+            case 6: /* fbstp packed bcd */
+                ea.bytes = 10;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fbstp", dst.val);
+                break;
+            case 7: /* fistp m64i */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistpll", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
         break;
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
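The new FPU cases above all execute the guest instruction on the host FPU through the emulate_fpu_insn_memsrc()/emulate_fpu_insn_memdst() helpers, whose definitions are not shown in this section. A rough sketch of the idea only; the real macros in x86_emulate.c presumably also bracket the instruction with FPU state get/put handling:

    /* Illustrative sketch, not the actual definitions. */

    /* Execute string-literal opcode @opc with @arg as its memory source. */
    #define emulate_fpu_insn_memsrc(opc, arg) \
        asm volatile ( opc " %0" : : "m" (arg) )

    /* Execute @opc with @arg as its memory destination. */
    #define emulate_fpu_insn_memdst(opc, arg) \
        asm volatile ( opc " %0" : "=m" (arg) )
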
@@ -2924,7 +3362,6 @@ x86_emulate(
             /* out */
             fail_if(ops->write_io == NULL);
             rc = ops->write_io(port, op_bytes, _regs.eax, ctxt);
-            
         }
         else
         {
@@ -3242,9 +3679,9 @@ x86_emulate(
             if ( op_bytes == 2 )
                 reg.base &= 0xffffff;
             if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0,
-                                  reg.limit, 2, ctxt)) ||
+                                  &reg.limit, 2, ctxt)) ||
                  (rc = ops->write(ea.mem.seg, ea.mem.off+2,
-                                  reg.base, mode_64bit() ? 8 : 4, ctxt)) )
+                                  &reg.base, mode_64bit() ? 8 : 4, ctxt)) )
                 goto done;
             break;
         case 2: /* lgdt */
@@ -3252,10 +3689,10 @@ x86_emulate(
             generate_exception_if(ea.type != OP_MEM, EXC_UD, -1);
             fail_if(ops->write_segment == NULL);
             memset(&reg, 0, sizeof(reg));
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0,
-                                 &limit, 2, ctxt)) ||
-                 (rc = ops->read(ea.mem.seg, ea.mem.off+2,
-                                 &base, mode_64bit() ? 8 : 4, ctxt)) )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+                                  &limit, 2, ctxt, ops)) ||
+                 (rc = read_ulong(ea.mem.seg, ea.mem.off+2,
+                                  &base, mode_64bit() ? 8 : 4, ctxt, ops)) )
                 goto done;
             reg.base = base;
             reg.limit = limit;
@@ -3267,7 +3704,8 @@ x86_emulate(
                 goto done;
             break;
         case 4: /* smsw */
-            ea.bytes = 2;
+            if ( ea.type == OP_MEM )
+                ea.bytes = 2;
             dst = ea;
             fail_if(ops->read_cr == NULL);
             if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
@@ -3281,11 +3719,11 @@ x86_emulate(
                 goto done;
             if ( ea.type == OP_REG )
                 cr0w = *ea.reg;
-            else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
-                                      &cr0w, 2, ctxt)) )
+            else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+                                       &cr0w, 2, ctxt, ops)) )
                 goto done;
-            cr0 &= 0xffff0000;
-            cr0 |= (uint16_t)cr0w;
+            /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+            cr0 = (cr0 & ~0xe) | (cr0w & 0xf);
             if ( (rc = ops->write_cr(0, cr0, ctxt)) )
                 goto done;
             break;
@@ -3404,8 +3842,10 @@ x86_emulate(
         if ( ea.type == OP_MEM )
         {
             unsigned long lval, hval;
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
-                 (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+                                  &lval, 4, ctxt, ops)) ||
+                 (rc = read_ulong(ea.mem.seg, ea.mem.off+4,
+                                  &hval, 4, ctxt, ops)) )
                 goto done;
             val = ((uint64_t)hval << 32) | (uint32_t)lval;
             stub[2] = modrm & 0x38; /* movq (%eax),%mmN */
@@ -3428,8 +3868,8 @@ x86_emulate(
         if ( ea.type == OP_MEM )
         {
             unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32);
-            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) ||
-                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) )
+            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
+                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
                 goto done;
         }
         break;
@@ -3481,8 +3921,8 @@ x86_emulate(
 
         /* Get actual old value. */
         for ( i = 0; i < (op_bytes/sizeof(long)); i++ )
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long),
-                                 &old[i], sizeof(long), ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long),
+                                  &old[i], sizeof(long), ctxt, ops)) != 0 )
                 goto done;
 
         /* Get expected and proposed values. */
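Throughout the hunks above, direct ops->read() calls that fetched an unsigned long are replaced by a read_ulong() helper, because ops->read() now fills a caller-supplied byte buffer (see the x86_emulate.h change below). The helper's definition is not part of this section; a minimal sketch of what such a wrapper has to do is to zero the destination so short reads are zero-extended, then delegate to the backend:

    /* Assumed shape of read_ulong(); the real definition is not shown here. */
    static int read_ulong(
        enum x86_segment seg,
        unsigned long offset,
        unsigned long *val,
        unsigned int bytes,
        struct x86_emulate_ctxt *ctxt,
        struct x86_emulate_ops *ops)
    {
        *val = 0;  /* zero-extend: only the low @bytes are filled by the read */
        return ops->read(seg, offset, val, bytes, ctxt);
    }
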
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h    Wed Jul 02 11:30:37 2008 +0900
@@ -102,7 +102,8 @@ enum x86_emulate_fpu_type {
 };
 
 /*
- * These operations represent the instruction emulator's interface to memory.
+ * These operations represent the instruction emulator's interface to memory,
+ * I/O ports, privileged state... pretty much everything other than GPRs.
  * 
  * NOTES:
  *  1. If the access fails (cannot emulate, or a standard access faults) then
@@ -110,8 +111,7 @@ enum x86_emulate_fpu_type {
  *     some out-of-band mechanism, unknown to the emulator. The memop signals
  *     failure by returning X86EMUL_EXCEPTION to the emulator, which will
  *     then immediately bail.
- *  2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes.
- *  3. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ *  2. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
  */
 struct x86_emulate_ops
 {
@@ -121,19 +121,25 @@ struct x86_emulate_ops
      * All memory-access functions:
      *  @seg:   [IN ] Segment being dereferenced (specified as x86_seg_??).
      *  @offset:[IN ] Offset within segment.
+     *  @p_data:[IN ] Pointer to i/o data buffer (length is @bytes)
      * Read functions:
      *  @val:   [OUT] Value read, zero-extended to 'ulong'.
      * Write functions:
      *  @val:   [IN ] Value to write (low-order bytes used as req'd).
      * Variable-length access functions:
-     *  @bytes: [IN ] Number of bytes to read or write.
-     */
-
-    /* read: Emulate a memory read. */
+     *  @bytes: [IN ] Number of bytes to read or write. Valid access sizes are
+     *                1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise
+     *                stated.
+     */
+
+    /*
+     * read: Emulate a memory read.
+     *  @bytes: Access length (0 < @bytes < 4096).
+     */
     int (*read)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long *val,
+        void *p_data,
         unsigned int bytes,
         struct x86_emulate_ctxt *ctxt);
 
@@ -144,15 +150,18 @@ struct x86_emulate_ops
     int (*insn_fetch)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long *val,
-        unsigned int bytes,
-        struct x86_emulate_ctxt *ctxt);
-
-    /* write: Emulate a memory write. */
+        void *p_data,
+        unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
+     * write: Emulate a memory write.
+     *  @bytes: Access length (0 < @bytes < 4096).
+     */
     int (*write)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long val,
+        void *p_data,
         unsigned int bytes,
         struct x86_emulate_ctxt *ctxt);
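With read(), insn_fetch() and write() now taking a void *p_data buffer instead of passing values as unsigned long, a backend simply copies bytes in either direction. A deliberately trivial, hypothetical backend against the new signatures (flat_read, flat_write and guest_mem are invented names; segmentation, bounds checks and error handling are omitted, and the usual x86_emulate.h and string helpers are assumed to be in scope):

    /* Hypothetical flat-memory backend for the revised byte-buffer interface. */
    static char guest_mem[0x10000];

    static int flat_read(enum x86_segment seg, unsigned long offset,
                         void *p_data, unsigned int bytes,
                         struct x86_emulate_ctxt *ctxt)
    {
        memcpy(p_data, &guest_mem[offset], bytes);  /* copy out to the emulator */
        return X86EMUL_OKAY;
    }

    static int flat_write(enum x86_segment seg, unsigned long offset,
                          void *p_data, unsigned int bytes,
                          struct x86_emulate_ctxt *ctxt)
    {
        memcpy(&guest_mem[offset], p_data, bytes);  /* copy in from the emulator */
        return X86EMUL_OKAY;
    }
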
 
diff -r 11318234588e -r 08f77df14cba xen/common/domain.c
--- a/xen/common/domain.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/common/domain.c       Wed Jul 02 11:30:37 2008 +0900
@@ -73,21 +73,133 @@ int current_domain_id(void)
     return current->domain->domain_id;
 }
 
-struct domain *alloc_domain(domid_t domid)
+static struct domain *alloc_domain_struct(void)
+{
+    return xmalloc(struct domain);
+}
+
+static void free_domain_struct(struct domain *d)
+{
+    xfree(d);
+}
+
+static void __domain_finalise_shutdown(struct domain *d)
+{
+    struct vcpu *v;
+
+    BUG_ON(!spin_is_locked(&d->shutdown_lock));
+
+    if ( d->is_shut_down )
+        return;
+
+    for_each_vcpu ( d, v )
+        if ( !v->paused_for_shutdown )
+            return;
+
+    d->is_shut_down = 1;
+    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+}
+
+static void vcpu_check_shutdown(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+
+    spin_lock(&d->shutdown_lock);
+
+    if ( d->is_shutting_down )
+    {
+        if ( !v->paused_for_shutdown )
+            vcpu_pause_nosync(v);
+        v->paused_for_shutdown = 1;
+        v->defer_shutdown = 0;
+        __domain_finalise_shutdown(d);
+    }
+
+    spin_unlock(&d->shutdown_lock);
+}
+
+struct vcpu *alloc_vcpu(
+    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
+{
+    struct vcpu *v;
+
+    BUG_ON(d->vcpu[vcpu_id] != NULL);
+
+    if ( (v = alloc_vcpu_struct()) == NULL )
+        return NULL;
+
+    v->domain = d;
+    v->vcpu_id = vcpu_id;
+
+    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+    v->runstate.state_entry_time = NOW();
+
+    if ( !is_idle_domain(d) )
+    {
+        set_bit(_VPF_down, &v->pause_flags);
+        v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
+    }
+
+    if ( sched_init_vcpu(v, cpu_id) != 0 )
+    {
+        free_vcpu_struct(v);
+        return NULL;
+    }
+
+    if ( vcpu_initialise(v) != 0 )
+    {
+        sched_destroy_vcpu(v);
+        free_vcpu_struct(v);
+        return NULL;
+    }
+
+    d->vcpu[vcpu_id] = v;
+    if ( vcpu_id != 0 )
+        d->vcpu[v->vcpu_id-1]->next_in_list = v;
+
+    /* Must be called after making new vcpu visible to for_each_vcpu(). */
+    vcpu_check_shutdown(v);
+
+    return v;
+}
+
+struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
 {
     struct domain *d;
-
-    if ( (d = xmalloc(struct domain)) == NULL )
+    struct vcpu *v;
+    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+
+    if ( (v = idle_vcpu[cpu_id]) != NULL )
+        return v;
+
+    d = (vcpu_id == 0) ?
+        domain_create(IDLE_DOMAIN_ID, 0, 0) :
+        idle_vcpu[cpu_id - vcpu_id]->domain;
+    BUG_ON(d == NULL);
+
+    v = alloc_vcpu(d, vcpu_id, cpu_id);
+    idle_vcpu[cpu_id] = v;
+
+    return v;
+}
+
+struct domain *domain_create(
+    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+{
+    struct domain *d, **pd;
+    enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
+           INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };
+    int init_status = 0;
+
+    if ( (d = alloc_domain_struct()) == NULL )
         return NULL;
 
     memset(d, 0, sizeof(*d));
     d->domain_id = domid;
 
     if ( xsm_alloc_security_domain(d) != 0 )
-    {
-        free_domain(d);
-        return NULL;
-    }
+        goto fail;
+    init_status |= INIT_xsm;
 
     atomic_set(&d->refcnt, 1);
     spin_lock_init(&d->domain_lock);
@@ -97,132 +209,17 @@ struct domain *alloc_domain(domid_t domi
     INIT_LIST_HEAD(&d->page_list);
     INIT_LIST_HEAD(&d->xenpage_list);
 
-    return d;
-}
-
-void free_domain(struct domain *d)
-{
-    xsm_free_security_domain(d);
-    xfree(d);
-}
-
-static void __domain_finalise_shutdown(struct domain *d)
-{
-    struct vcpu *v;
-
-    BUG_ON(!spin_is_locked(&d->shutdown_lock));
-
-    if ( d->is_shut_down )
-        return;
-
-    for_each_vcpu ( d, v )
-        if ( !v->paused_for_shutdown )
-            return;
-
-    d->is_shut_down = 1;
-    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
-}
-
-static void vcpu_check_shutdown(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-
-    spin_lock(&d->shutdown_lock);
-
-    if ( d->is_shutting_down )
-    {
-        if ( !v->paused_for_shutdown )
-            vcpu_pause_nosync(v);
-        v->paused_for_shutdown = 1;
-        v->defer_shutdown = 0;
-        __domain_finalise_shutdown(d);
-    }
-
-    spin_unlock(&d->shutdown_lock);
-}
-
-struct vcpu *alloc_vcpu(
-    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
-{
-    struct vcpu *v;
-
-    BUG_ON(d->vcpu[vcpu_id] != NULL);
-
-    if ( (v = alloc_vcpu_struct()) == NULL )
-        return NULL;
-
-    v->domain = d;
-    v->vcpu_id = vcpu_id;
-
-    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
-    v->runstate.state_entry_time = NOW();
-
-    if ( !is_idle_domain(d) )
-    {
-        set_bit(_VPF_down, &v->pause_flags);
-        v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
-    }
-
-    if ( sched_init_vcpu(v, cpu_id) != 0 )
-    {
-        free_vcpu_struct(v);
-        return NULL;
-    }
-
-    if ( vcpu_initialise(v) != 0 )
-    {
-        sched_destroy_vcpu(v);
-        free_vcpu_struct(v);
-        return NULL;
-    }
-
-    d->vcpu[vcpu_id] = v;
-    if ( vcpu_id != 0 )
-        d->vcpu[v->vcpu_id-1]->next_in_list = v;
-
-    /* Must be called after making new vcpu visible to for_each_vcpu(). */
-    vcpu_check_shutdown(v);
-
-    return v;
-}
-
-struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
-{
-    struct domain *d;
-    struct vcpu *v;
-    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
-
-    if ( (v = idle_vcpu[cpu_id]) != NULL )
-        return v;
-
-    d = (vcpu_id == 0) ?
-        domain_create(IDLE_DOMAIN_ID, 0, 0) :
-        idle_vcpu[cpu_id - vcpu_id]->domain;
-    BUG_ON(d == NULL);
-
-    v = alloc_vcpu(d, vcpu_id, cpu_id);
-    idle_vcpu[cpu_id] = v;
-
-    return v;
-}
-
-struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
-{
-    struct domain *d, **pd;
-    enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 }; 
-    int init_status = 0;
-
-    if ( (d = alloc_domain(domid)) == NULL )
-        return NULL;
-
     if ( domcr_flags & DOMCRF_hvm )
         d->is_hvm = 1;
 
     if ( (domid == 0) && opt_dom0_vcpus_pin )
         d->is_pinned = 1;
 
+    if ( domcr_flags & DOMCRF_dummy )
+        return d;
+
     rangeset_domain_initialise(d);
+    init_status |= INIT_rangeset;
 
     if ( !is_idle_domain(d) )
     {
@@ -278,8 +275,11 @@ struct domain *domain_create(
         grant_table_destroy(d);
     if ( init_status & INIT_evtchn )
         evtchn_destroy(d);
-    rangeset_domain_destroy(d);
-    free_domain(d);
+    if ( init_status & INIT_rangeset )
+        rangeset_domain_destroy(d);
+    if ( init_status & INIT_xsm )
+        xsm_free_security_domain(d);
+    free_domain_struct(d);
     return NULL;
 }
 
@@ -535,7 +535,8 @@ static void complete_domain_destroy(stru
     if ( d->target != NULL )
         put_domain(d->target);
 
-    free_domain(d);
+    xsm_free_security_domain(d);
+    free_domain_struct(d);
 
     send_guest_global_virq(dom0, VIRQ_DOM_EXC);
 }
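The reworked domain_create() replaces the old all-or-nothing alloc_domain()/free_domain() pair with per-step init_status bits, so the failure path tears down only what was actually set up, in reverse order. Adding a further construction step under this scheme would look roughly like the sketch below; INIT_foo, foo_init() and the rest are invented names used purely to illustrate the pattern:

    /* Hypothetical illustration of the init_status unwind idiom. */
    static int foo_init(void);
    static void foo_destroy(void);
    static int bar_init(void);
    static void bar_destroy(void);

    static int construct(void)
    {
        enum { INIT_foo = 1u<<0, INIT_bar = 1u<<1 };
        int init_status = 0;

        if ( foo_init() != 0 )
            goto fail;
        init_status |= INIT_foo;

        if ( bar_init() != 0 )
            goto fail;
        init_status |= INIT_bar;

        return 0;

     fail:
        /* Undo only the steps that succeeded, newest first. */
        if ( init_status & INIT_bar )
            bar_destroy();
        if ( init_status & INIT_foo )
            foo_destroy();
        return -1;
    }
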
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.c
--- a/xen/drivers/passthrough/vtd/dmar.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.c        Wed Jul 02 11:30:37 2008 +0900
@@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent
     dmaru->address = drhd->address;
     dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
     INIT_LIST_HEAD(&dmaru->ioapic_list);
-    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address);
+    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
+            dmaru->address);
 
     dev_scope_start = (void *)(drhd + 1);
     dev_scope_end   = ((void *)drhd) + header->length;
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.h
--- a/xen/drivers/passthrough/vtd/dmar.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.h        Wed Jul 02 11:30:37 2008 +0900
@@ -42,28 +42,28 @@ struct acpi_ioapic_unit {
 
 struct acpi_drhd_unit {
     struct list_head list;
-    unsigned long    address; /* register base address of the unit */
-    struct    pci_dev *devices; /* target devices */
+    u64    address; /* register base address of the unit */
+    struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    include_all:1;
+    u8     include_all:1;
     struct iommu *iommu;
     struct list_head ioapic_list;
 };
 
 struct acpi_rmrr_unit {
     struct list_head list;
-    unsigned long base_address;
-    unsigned long end_address;
+    u64    base_address;
+    u64    end_address;
     struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    allow_all:1;
+    u8     allow_all:1;
 };
 
 struct acpi_atsr_unit {
     struct list_head list;
-    struct    pci_dev *devices; /* target devices */
+    struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    all_ports:1;
+    u8     all_ports:1;
 };
 
 #define for_each_iommu(domain, iommu) \
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/intremap.c
--- a/xen/drivers/passthrough/vtd/intremap.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/intremap.c    Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte(
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 )
+    if ( ir_ctrl == NULL )
     {
         dprintk(XENLOG_ERR VTDPREFIX,
                 "remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
@@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st
     }
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
     iommu_flush_iec_index(iommu, 0, index);
     invalidate_sync(iommu);
 
@@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte(
     struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
+         ir_ctrl->iremap_index == -1 )
     {
         *IO_APIC_BASE(apic) = reg;
         return *(IO_APIC_BASE(apic)+4);
@@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry(
     remap_rte->data = 0;
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
     iommu_flush_iec_index(iommu, 0, index);
     invalidate_sync(iommu);
 
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c       Wed Jul 02 11:30:37 2008 +0900
@@ -1269,7 +1269,6 @@ static int domain_context_mapping(
 }
 
 static int domain_context_unmap_one(
-    struct domain *domain,
     struct iommu *iommu,
     u8 bus, u8 devfn)
 {
@@ -1300,7 +1299,6 @@ static int domain_context_unmap_one(
 }
 
 static int domain_context_unmap(
-    struct domain *domain,
     struct iommu *iommu,
     struct pci_dev *pdev)
 {
@@ -1320,14 +1318,13 @@ static int domain_context_unmap(
             PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
         break;
     case DEV_TYPE_PCIe_ENDPOINT:
-        ret = domain_context_unmap_one(domain, iommu,
+        ret = domain_context_unmap_one(iommu,
                                        (u8)(pdev->bus), (u8)(pdev->devfn));
         break;
     case DEV_TYPE_PCI:
         if ( pdev->bus == 0 )
             ret = domain_context_unmap_one(
-                domain, iommu,
-                (u8)(pdev->bus), (u8)(pdev->devfn));
+                iommu, (u8)(pdev->bus), (u8)(pdev->devfn));
         else
         {
             if ( bus2bridge[pdev->bus].bus != 0 )
@@ -1335,7 +1332,7 @@ static int domain_context_unmap(
                          "domain_context_unmap:"
                          "bus2bridge[%d].bus != 0\n", pdev->bus);
 
-            ret = domain_context_unmap_one(domain, iommu,
+            ret = domain_context_unmap_one(iommu,
                                            (u8)(bus2bridge[pdev->bus].bus),
                                            (u8)(bus2bridge[pdev->bus].devfn));
 
@@ -1345,8 +1342,7 @@ static int domain_context_unmap(
                 for ( func = 0; func < 8; func++ )
                 {
                     ret = domain_context_unmap_one(
-                        domain, iommu,
-                        pdev->bus, (u8)PCI_DEVFN(dev, func));
+                        iommu, pdev->bus, (u8)PCI_DEVFN(dev, func));
                     if ( ret )
                         return ret;
                 }
@@ -1389,7 +1385,7 @@ void reassign_device_ownership(
  found:
     drhd = acpi_find_matched_drhd_unit(pdev);
     iommu = drhd->iommu;
-    domain_context_unmap(source, iommu, pdev);
+    domain_context_unmap(iommu, pdev);
 
     /* Move pci device from the source domain to target domain. */
     spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
@@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev(
     struct pci_dev *pdev)
 {
     struct acpi_drhd_unit *drhd;
-    unsigned long size;
+    u64 size;
     int ret;
 
     /* page table init */
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/qinval.c
--- a/xen/drivers/passthrough/vtd/qinval.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/qinval.c      Wed Jul 02 11:30:37 2008 +0900
@@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu)
     int ret = -1;
     struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
 
-    if ( qi_ctrl->qinval_maddr == 0 )
+    if ( qi_ctrl->qinval_maddr != 0 )
     {
         ret = queue_invalidate_wait(iommu,
             0, 1, 1, 1, &qi_ctrl->qinval_poll_status);
@@ -416,7 +416,6 @@ int qinval_setup(struct iommu *iommu)
 int qinval_setup(struct iommu *iommu)
 {
     s_time_t start_time;
-    u32 status = 0;
     struct qi_ctrl *qi_ctrl;
     struct iommu_flush *flush;
 
@@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu)
 
     /* Make sure hardware complete it */
     start_time = NOW();
-    for ( ; ; )
-    {
-        status = dmar_readl(iommu->reg, DMAR_GSTS_REG);
-        if ( status & DMA_GSTS_QIES )
-            break;
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
+    {
         if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
             panic("Cannot set QIE field for queue invalidation\n");
         cpu_relax();
     }
-    status = 0;
-    return status;
-}
+
+    return 0;
+}
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/utils.c
--- a/xen/drivers/passthrough/vtd/utils.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/utils.c       Wed Jul 02 11:30:37 2008 +0900
@@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u
     struct iommu *iommu = drhd->iommu;
 
     printk("---- print_iommu_regs ----\n");
-    printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+    printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
     printk("print_iommu_regs: DMAR_VER_REG = %x\n",
            dmar_readl(iommu->reg,DMAR_VER_REG));
     printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/cpufreq.h
--- a/xen/include/acpi/cpufreq/cpufreq.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/cpufreq.h        Wed Jul 02 11:30:37 2008 +0900
@@ -36,7 +36,10 @@ struct cpufreq_policy {
     unsigned int        max;    /* in kHz */
     unsigned int        cur;    /* in kHz, only needed if cpufreq
                                  * governors are used */
+    unsigned int        resume; /* flag for cpufreq 1st run
+                                 * S3 wakeup, hotplug cpu, etc */
 };
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
 
 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
 #define CPUFREQ_SHARED_TYPE_HW   (1) /* HW does needed coordination */
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/processor_perf.h
--- a/xen/include/acpi/cpufreq/processor_perf.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/processor_perf.h Wed Jul 02 11:30:37 2008 +0900
@@ -6,9 +6,21 @@
 
 int get_cpu_id(u8);
 int acpi_cpufreq_init(void);
+int powernow_cpufreq_init(void);
+
 void px_statistic_update(cpumask_t, uint8_t, uint8_t);
 int  px_statistic_init(int);
 void px_statistic_reset(int);
+void px_statistic_suspend(void);
+void px_statistic_resume(void);
+
+void cpufreq_dom_exit(void);
+int  cpufreq_dom_init(void);
+int  cpufreq_dom_dbs(unsigned int);
+void cpufreq_suspend(void);
+int  cpufreq_resume(void);
+
+inline uint64_t get_cpu_idle_time(unsigned int);
 
 struct processor_performance {
     uint32_t state;
@@ -44,6 +56,7 @@ struct pm_px {
 struct pm_px {
     struct px_stat u;
     uint64_t prev_state_wall;
+    uint64_t prev_idle_wall;
 };
 
 extern struct pm_px px_statistic_data[NR_CPUS];
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/domain.h      Wed Jul 02 11:30:37 2008 +0900
@@ -103,6 +103,9 @@ struct shadow_domain {
      * emulation and remove write permission
      */
     atomic_t          gtable_dirty_version;
+
+    /* OOS */
+    int oos_active;
 };
 
 struct shadow_vcpu {
@@ -122,6 +125,17 @@ struct shadow_vcpu {
     unsigned long last_emulated_frame;
     /* Last MFN that we emulated a write successfully */
     unsigned long last_emulated_mfn;
+
+    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+    mfn_t oos[SHADOW_OOS_PAGES];
+    unsigned long oos_va[SHADOW_OOS_PAGES];
+    mfn_t oos_snapshot[SHADOW_OOS_PAGES];
+    struct oos_fixup {
+        mfn_t gmfn;
+        mfn_t smfn;
+        unsigned long off;
+    } *oos_fixups;
+    int oos_fixup_used;
 };
 
 /************************************************/
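
The per-vcpu oos[] array holds the small set of guest pages this vcpu has allowed to go out of sync (SHADOW_OOS_PAGES entries, defined in asm-x86/mm.h below). A hedged sketch of the obvious membership test over that fixed-size array (mfn_t is simplified to an integer and INVALID_MFN is an assumed empty-slot sentinel):

    #include <stdint.h>

    #define SHADOW_OOS_PAGES 3             /* as defined in asm-x86/mm.h below */
    typedef uint64_t mfn_t;                /* simplified stand-in for Xen's mfn_t */
    #define INVALID_MFN ((mfn_t)~0ULL)     /* assumed sentinel for an unused slot */

    /* Is this guest MFN currently in the vcpu's out-of-sync set? */
    static int mfn_is_out_of_sync(const mfn_t oos[SHADOW_OOS_PAGES], mfn_t gmfn)
    {
        int i;

        for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
            if ( oos[i] != INVALID_MFN && oos[i] == gmfn )
                return 1;

        return 0;
    }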
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Jul 02 11:30:37 2008 +0900
@@ -333,10 +333,10 @@ enum vmcs_field {
 #define VMCS_VPID_WIDTH 16
 
 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr);
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val);
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val);
-int vmx_add_guest_msr(struct vcpu *v, u32 msr);
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr);
+int vmx_read_guest_msr(u32 msr, u64 *val);
+int vmx_write_guest_msr(u32 msr, u64 val);
+int vmx_add_guest_msr(u32 msr);
+int vmx_add_host_load_msr(u32 msr);
 
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
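
The MSR accessors lose their struct vcpu argument here, so call sites presumably operate on the currently executing vcpu. A hedged before/after sketch of a call site (the stub prototypes are copied from the hunk; the MSR chosen and the bit being set are purely illustrative):

    #include <stdint.h>

    typedef uint32_t u32;
    typedef uint64_t u64;

    /* New prototypes, as declared above. */
    int vmx_read_guest_msr(u32 msr, u64 *val);
    int vmx_write_guest_msr(u32 msr, u64 val);

    /* Old call shape: vmx_read_guest_msr(v, msr, &val);
     * New call shape: the vcpu is implicit (presumably "current"). */
    static void set_low_bit_of_guest_msr(u32 msr)
    {
        u64 val;

        if ( vmx_read_guest_msr(msr, &val) == 0 )
            (void)vmx_write_guest_msr(msr, val | 1);
    }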
 
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/mm.h  Wed Jul 02 11:30:37 2008 +0900
@@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 3
+
+/* The order of the OOS fixup tables allocated per vcpu */
+#define SHADOW_OOS_FT_ORDER 1
+/* OOS fixup tables hash entries */
+#define SHADOW_OOS_FT_HASH 13
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/perfc_defn.h  Wed Jul 02 11:30:37 2008 +0900
@@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_3,  "shad
 PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
 PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
 PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
+PERFCOUNTER(shadow_writeable_h_7,  "shadow writeable: sl1p")
+PERFCOUNTER(shadow_writeable_h_8,  "shadow writeable: sl1p failed")
 PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
+PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf")
 PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
 PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
 PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit")
@@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_non_pt,   "shad
 PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
 PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
 
+PERFCOUNTER(shadow_oos_fixup_add_ok,    "shadow OOS fixups adds")
+PERFCOUNTER(shadow_oos_fixup_no_add,    "shadow OOS fixups no adds")
+PERFCOUNTER(shadow_oos_fixup_add_fail,  "shadow OOS fixups adds failed")
+PERFCOUNTER(shadow_oos_fixup_remove,    "shadow OOS fixups removes")
+PERFCOUNTER(shadow_oos_fixup_flush,     "shadow OOS fixups flushes")
+PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes")
+
+PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
+PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 11318234588e -r 08f77df14cba xen/include/public/hvm/hvm_op.h
--- a/xen/include/public/hvm/hvm_op.h   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/public/hvm/hvm_op.h   Wed Jul 02 11:30:37 2008 +0900
@@ -92,6 +92,19 @@ typedef struct xen_hvm_track_dirty_vram 
 typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
 
+/* Notify that some pages got modified by the Device Model. */
+#define HVMOP_modified_memory    7
+struct xen_hvm_modified_memory {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* First pfn. */
+    uint64_aligned_t first_pfn;
+    /* Number of pages. */
+    uint64_aligned_t nr;
+};
+typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
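
HVMOP_modified_memory lets a device model tell Xen that it has dirtied guest frames behind the hypervisor's back, which matters for log-dirty tracking and live migration. A hedged sketch of filling the new argument structure (types are simplified stand-ins; the actual hypercall plumbing, e.g. a libxc wrapper, is assumed and not shown):

    #include <stdint.h>

    typedef uint16_t domid_t;              /* simplified stand-in */

    /* Mirror of the new argument (uint64_aligned_t shown as plain uint64_t). */
    struct xen_hvm_modified_memory {
        domid_t  domid;                    /* domain to be updated */
        uint64_t first_pfn;                /* first pfn */
        uint64_t nr;                       /* number of pages */
    };

    /* A device model that wrote guest frames 0x100..0x10f directly would fill
     * the argument like this and issue HVMOP_modified_memory (the hypercall
     * invocation itself is omitted here). */
    static struct xen_hvm_modified_memory mark_modified_example(domid_t domid)
    {
        struct xen_hvm_modified_memory arg = {
            .domid     = domid,
            .first_pfn = 0x100,
            .nr        = 0x10,
        };
        return arg;
    }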
diff -r 11318234588e -r 08f77df14cba xen/include/xen/domain.h
--- a/xen/include/xen/domain.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/domain.h  Wed Jul 02 11:30:37 2008 +0900
@@ -15,9 +15,6 @@ int boot_vcpu(
     struct domain *d, int vcpuid, vcpu_guest_context_u ctxt);
 struct vcpu *alloc_idle_vcpu(unsigned int cpu_id);
 void vcpu_reset(struct vcpu *v);
-
-struct domain *alloc_domain(domid_t domid);
-void free_domain(struct domain *d);
 
 struct xen_domctl_getdomaininfo;
 void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
diff -r 11318234588e -r 08f77df14cba xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/sched.h   Wed Jul 02 11:30:37 2008 +0900
@@ -315,10 +315,14 @@ struct domain *domain_create(
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
-#define _DOMCRF_hvm 0
-#define DOMCRF_hvm  (1U<<_DOMCRF_hvm)
-#define _DOMCRF_hap 1
-#define DOMCRF_hap  (1U<<_DOMCRF_hap)
+#define _DOMCRF_hvm   0
+#define DOMCRF_hvm    (1U<<_DOMCRF_hvm)
+ /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
+#define _DOMCRF_hap   1
+#define DOMCRF_hap    (1U<<_DOMCRF_hap)
+ /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
+#define _DOMCRF_dummy 2
+#define DOMCRF_dummy  (1U<<_DOMCRF_dummy)
 
 int construct_dom0(
     struct domain *d,
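
With the reworked flag block, domain_create() callers combine DOMCRF_* bits with bitwise OR. A hedged sketch of the two obvious call shapes (the stub typedefs and the ssidref value 0 are placeholders):

    #include <stdint.h>

    typedef uint16_t domid_t;              /* stand-in */
    typedef uint32_t ssidref_t;            /* stand-in */
    struct domain;

    /* Prototype and flags as declared in the hunk above. */
    struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
                                 ssidref_t ssidref);
    #define DOMCRF_hvm    (1U<<0)
    #define DOMCRF_hap    (1U<<1)
    #define DOMCRF_dummy  (1U<<2)

    /* An HVM guest using hardware-assisted paging. */
    static struct domain *create_hap_hvm_guest(domid_t domid)
    {
        return domain_create(domid, DOMCRF_hvm | DOMCRF_hap, 0);
    }

    /* A dummy domain: never scheduled and not on the domain list. */
    static struct domain *create_dummy_domain(domid_t domid)
    {
        return domain_create(domid, DOMCRF_dummy, 0);
    }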

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
