xen-changelog

[Xen-changelog] [xen-unstable] merge with xen-unstable.hg

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] merge with xen-unstable.hg
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 01 Dec 2008 03:50:37 -0800
Delivery-date: Mon, 01 Dec 2008 03:52:38 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1225770199 -32400
# Node ID e75cb35c798beabee0b0ed4025ef82a39c702279
# Parent  10f0e1bb8e5e9a28e1ebe3fbb9291fb8114ef4bc
# Parent  43a079fd50fdab01cd2be443bfef011b3b0495ae
merge with xen-unstable.hg
---
 xen/common/xmalloc.c                          |  286 --------------
 .hgignore                                     |    1 
 extras/mini-os/include/sched.h                |    3 
 extras/mini-os/include/wait.h                 |   10 
 extras/mini-os/minios.mk                      |    3 
 tools/Makefile                                |    1 
 tools/blktap/drivers/block-qcow.c             |   24 -
 tools/firmware/hvmloader/acpi/static_tables.c |    2 
 tools/firmware/rombios/rombios.c              |    4 
 tools/flask/policy/policy/modules/xen/xen.te  |    3 
 tools/python/xen/util/diagnose.py             |    4 
 tools/python/xen/xend/XendConfig.py           |   17 
 tools/python/xen/xend/XendDomainInfo.py       |   73 ++-
 tools/python/xen/xend/server/DevConstants.py  |   45 ++
 tools/python/xen/xend/server/DevController.py |   31 -
 tools/python/xen/xend/server/iopif.py         |   20 -
 tools/python/xen/xend/server/irqif.py         |   19 
 tools/python/xen/xend/server/pciif.py         |    3 
 tools/python/xen/xend/server/vscsiif.py       |   15 
 tools/python/xen/xm/create.py                 |   14 
 tools/python/xen/xm/main.py                   |    5 
 tools/xenpmd/Makefile                         |   20 +
 tools/xenpmd/xenpmd.c                         |  520 ++++++++++++++++++++++++++
 xen/arch/ia64/xen/cpufreq/cpufreq.c           |   15 
 xen/arch/ia64/xen/irq.c                       |    2 
 xen/arch/x86/acpi/cpu_idle.c                  |  103 ++---
 xen/arch/x86/acpi/cpufreq/cpufreq.c           |   14 
 xen/arch/x86/acpi/cpufreq/powernow.c          |   14 
 xen/arch/x86/acpi/cpuidle_menu.c              |   14 
 xen/arch/x86/domain.c                         |  116 ++++-
 xen/arch/x86/domain_build.c                   |   34 +
 xen/arch/x86/hpet.c                           |    7 
 xen/arch/x86/hvm/emulate.c                    |   30 +
 xen/arch/x86/hvm/hpet.c                       |  339 +++++++++-------
 xen/arch/x86/hvm/hvm.c                        |    1 
 xen/arch/x86/hvm/i8254.c                      |    4 
 xen/arch/x86/hvm/rtc.c                        |    4 
 xen/arch/x86/hvm/svm/entry.S                  |    3 
 xen/arch/x86/hvm/vlapic.c                     |   10 
 xen/arch/x86/hvm/vmx/entry.S                  |    6 
 xen/arch/x86/hvm/vmx/vmx.c                    |   81 ++--
 xen/arch/x86/hvm/vmx/vpmu_core2.c             |   20 +
 xen/arch/x86/hvm/vpt.c                        |   18 
 xen/arch/x86/irq.c                            |    6 
 xen/arch/x86/mm.c                             |  251 +++++++++---
 xen/arch/x86/mm/hap/p2m-ept.c                 |    8 
 xen/arch/x86/mm/p2m.c                         |   17 
 xen/arch/x86/msi.c                            |   69 +--
 xen/arch/x86/oprofile/nmi_int.c               |   51 ++
 xen/arch/x86/oprofile/op_model_ppro.c         |  103 +++++
 xen/arch/x86/oprofile/op_x86_model.h          |    5 
 xen/arch/x86/setup.c                          |    1 
 xen/arch/x86/smpboot.c                        |   14 
 xen/arch/x86/time.c                           |    4 
 xen/arch/x86/traps.c                          |   29 -
 xen/arch/x86/x86_32/domain_page.c             |   10 
 xen/arch/x86/x86_64/compat/mm.c               |    5 
 xen/arch/x86/x86_64/cpufreq.c                 |   33 -
 xen/common/event_channel.c                    |    2 
 xen/common/kernel.c                           |    3 
 xen/common/keyhandler.c                       |    4 
 xen/common/spinlock.c                         |   69 +++
 xen/common/timer.c                            |  125 +++---
 xen/common/xenoprof.c                         |    2 
 xen/drivers/char/serial.c                     |    7 
 xen/drivers/cpufreq/cpufreq.c                 |  149 ++++++-
 xen/include/asm-x86/config.h                  |    8 
 xen/include/asm-x86/event.h                   |   32 -
 xen/include/asm-x86/fixmap.h                  |    1 
 xen/include/asm-x86/hvm/vmx/vpmu.h            |    2 
 xen/include/asm-x86/hvm/vmx/vpmu_core2.h      |   22 -
 xen/include/asm-x86/hvm/vpt.h                 |   70 +--
 xen/include/asm-x86/mm.h                      |   30 +
 xen/include/asm-x86/page.h                    |    3 
 xen/include/asm-x86/softirq.h                 |    3 
 xen/include/asm-x86/x86_32/page.h             |    3 
 xen/include/asm-x86/x86_64/page.h             |    5 
 xen/include/asm-x86/xenoprof.h                |    3 
 xen/include/public/features.h                 |    3 
 xen/include/public/trace.h                    |    2 
 xen/include/public/xen.h                      |   14 
 xen/include/xen/cpuidle.h                     |    8 
 xen/include/xen/domain_page.h                 |    6 
 xen/include/xen/spinlock.h                    |   23 +
 xen/include/xen/time.h                        |    1 
 xen/include/xen/timer.h                       |    3 
 xen/include/xlat.lst                          |    2 
 87 files changed, 2085 insertions(+), 1084 deletions(-)

diff -r 10f0e1bb8e5e -r e75cb35c798b .hgignore
--- a/.hgignore Tue Nov 04 12:07:22 2008 +0900
+++ b/.hgignore Tue Nov 04 12:43:19 2008 +0900
@@ -211,6 +211,7 @@
 ^tools/xenfb/vncfb$
 ^tools/xenmon/xentrace_setmask$
 ^tools/xenmon/xenbaked$
+^tools/xenpmd/xenpmd$
 ^tools/xenstat/xentop/xentop$
 ^tools/xenstore/testsuite/tmp/.*$
 ^tools/xenstore/xen$
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/sched.h
--- a/extras/mini-os/include/sched.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/include/sched.h    Tue Nov 04 12:43:19 2008 +0900
@@ -48,8 +48,9 @@ void exit_thread(void) __attribute__((no
 void exit_thread(void) __attribute__((noreturn));
 void schedule(void);
 
+#ifdef __INSIDE_MINIOS__
 #define current get_current()
-
+#endif
 
 void wake(struct thread *thread);
 void block(struct thread *thread);
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/wait.h
--- a/extras/mini-os/include/wait.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/include/wait.h     Tue Nov 04 12:43:19 2008 +0900
@@ -7,7 +7,7 @@
 
 #define DEFINE_WAIT(name)                               \
 struct wait_queue name = {                              \
-    .thread       = current,                            \
+    .thread       = get_current(),                            \
     .thread_list  = MINIOS_LIST_HEAD_INIT((name).thread_list), \
 }
 
@@ -53,7 +53,7 @@ static inline void wake_up(struct wait_q
     unsigned long flags;        \
     local_irq_save(flags);      \
     add_wait_queue(&wq, &w);    \
-    block(current);             \
+    block(get_current());       \
     local_irq_restore(flags);   \
 } while (0)
 
@@ -74,8 +74,8 @@ static inline void wake_up(struct wait_q
         /* protect the list */                                  \
         local_irq_save(flags);                                  \
         add_wait_queue(&wq, &__wait);                           \
-        current->wakeup_time = deadline;                        \
-        clear_runnable(current);                                \
+        get_current()->wakeup_time = deadline;                  \
+        clear_runnable(get_current());                          \
         local_irq_restore(flags);                               \
         if((condition) || (deadline && NOW() >= deadline))      \
             break;                                              \
@@ -83,7 +83,7 @@ static inline void wake_up(struct wait_q
     }                                                           \
     local_irq_save(flags);                                      \
     /* need to wake up */                                       \
-    wake(current);                                              \
+    wake(get_current());                                        \
     remove_wait_queue(&__wait);                                 \
     local_irq_restore(flags);                                   \
 } while(0) 
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/minios.mk
--- a/extras/mini-os/minios.mk  Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/minios.mk  Tue Nov 04 12:43:19 2008 +0900
@@ -25,6 +25,9 @@ else
 else
 DEF_CFLAGS += -O3
 endif
+
+# Make the headers define our internal stuff
+DEF_CFLAGS += -D__INSIDE_MINIOS__
 
 # Build the CFLAGS and ASFLAGS for compiling and assembling.
 # DEF_... flags are the common mini-os flags,
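
The three mini-os hunks above are one logical change: sched.h now provides the "current" shorthand only when __INSIDE_MINIOS__ is defined, wait.h's macros call get_current() directly because they are expanded inside application code, and minios.mk defines the symbol for mini-os's own build. The net effect is that programs including the mini-os headers no longer have every identifier named current rewritten behind their back. A minimal sketch of the effect; app_state and bump are hypothetical, for illustration only:

    /* mini-os internal code is compiled with -D__INSIDE_MINIOS__,
     * so the shorthand stays available there: */
    #ifdef __INSIDE_MINIOS__
    #define current get_current()
    #endif

    /* External code built without the define keeps control of the name. */
    struct app_state {
        int current;            /* hypothetical field */
    };

    static void bump(struct app_state *s)
    {
        /* With the old unconditional macro this would have expanded to
         * s->get_current()++ and failed to compile. */
        s->current++;
    }
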
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/Makefile
--- a/tools/Makefile    Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/Makefile    Tue Nov 04 12:43:19 2008 +0900
@@ -24,6 +24,7 @@ SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-y += fs-back
 SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
+SUBDIRS-y += xenpmd
 
 # These don't cross-compile
 ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:43:19 2008 +0900
@@ -722,11 +722,11 @@ static inline void init_fds(struct disk_
 /* Open the disk file and initialize qcow state. */
 static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
 {
-       int fd, len, i, shift, ret, size, l1_table_size, o_flags;
+       int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
        int max_aio_reqs;
        struct td_state     *bs = dd->td_state;
        struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
-       char *buf;
+       char *buf, *buf2;
        QCowHeader *header;
        QCowHeader_ext *exthdr;
        uint32_t cksum;
@@ -734,8 +734,8 @@ static int tdqcow_open (struct disk_driv
 
        DPRINTF("QCOW: Opening %s\n",name);
 
-       /* Since we don't handle O_DIRECT correctly, don't use it */
-       o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
+       o_flags = O_DIRECT | O_LARGEFILE | 
+               ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
        fd = open(name, o_flags);
        if (fd < 0) {
                DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
@@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driv
                (int) (s->l1_size * sizeof(uint64_t)), 
                l1_table_size);
 
-       lseek(fd, s->l1_table_offset, SEEK_SET);
-       if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
+       lseek(fd, 0, SEEK_SET);
+       l1_table_block = l1_table_size + s->l1_table_offset;
+       l1_table_block = l1_table_block + 512 - (l1_table_block % 512); 
+       ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
+       if (ret != 0) goto fail;
+       if (read(fd, buf2, l1_table_block) != l1_table_block)
                goto fail;
+       memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
 
        for(i = 0; i < s->l1_size; i++) {
                be64_to_cpus(&s->l1_table[i]);
@@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driv
 
                        DPRINTF("qcow: Converting image to big endian L1 table\n");
 
-                       lseek(fd, s->l1_table_offset, SEEK_SET);
-                       if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
+                       memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
+                       lseek(fd, 0, SEEK_SET);
+                       if (write(fd, buf2, l1_table_block) != l1_table_block) {
                                DPRINTF("qcow: Failed to write new L1 table\n");
                                goto fail;
                        }
@@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driv
        init_fds(dd);
 
        if (!final_cluster)
-               s->fd_end = s->l1_table_offset + l1_table_size;
+               s->fd_end = l1_table_block;
        else {
                s->fd_end = lseek(fd, 0, SEEK_END);
                if (s->fd_end == (off_t)-1)
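
The qcow change above re-enables O_DIRECT (note the removed comment that previously disabled it), and on Linux O_DIRECT requires the user buffer, file offset and transfer length to be sector-aligned. That is why the L1 table is no longer read at s->l1_table_offset directly: the code reads from offset 0 up to a 512-byte-rounded end point into a posix_memalign()ed bounce buffer (buf2) and copies the table out of it. A minimal sketch of the same pattern, assuming Linux O_DIRECT semantics; read_direct is a hypothetical helper and error handling is trimmed:

    #define _GNU_SOURCE
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Read "len" bytes at "offset" from an O_DIRECT fd by fetching a
     * sector-aligned superset into an aligned bounce buffer. */
    static int read_direct(int fd, void *dst, size_t len, off_t offset)
    {
        size_t span = (offset + len + 511) & ~511UL; /* round up to sectors */
        char *bounce;
        int ret = -1;

        if (posix_memalign((void **)&bounce, 4096, span))
            return -1;
        if (pread(fd, bounce, span, 0) == (ssize_t)span) {
            memcpy(dst, bounce + offset, len);
            ret = 0;
        }
        free(bounce);
        return ret;
    }

The masking idiom leaves an already-aligned length unchanged, whereas the additive rounding in the hunk above (x + 512 - x % 512) grows by a full sector when x is already a multiple of 512; both satisfy the alignment requirement.
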
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/hvmloader/acpi/static_tables.c
--- a/tools/firmware/hvmloader/acpi/static_tables.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/firmware/hvmloader/acpi/static_tables.c     Tue Nov 04 12:43:19 2008 +0900
@@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = {
 
     .p_lvl2_lat = 0x0fff, /* >100,  means we do not support C2 state */
     .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */
-    .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042,
+    .iapc_boot_arch = ACPI_8042,
     .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON |
               ACPI_WBINVD | ACPI_PWR_BUTTON |
               ACPI_FIX_RTC | ACPI_TMR_VAL_EXT),
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/rombios/rombios.c
--- a/tools/firmware/rombios/rombios.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/firmware/rombios/rombios.c  Tue Nov 04 12:43:19 2008 +0900
@@ -7216,7 +7216,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to read on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
@@ -7364,7 +7364,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to write on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/flask/policy/policy/modules/xen/xen.te
--- a/tools/flask/policy/policy/modules/xen/xen.te      Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/flask/policy/policy/modules/xen/xen.te      Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ allow dom0_t pirq_t:event {vector};
 allow dom0_t pirq_t:event {vector};
 allow dom0_t xen_t:mmu {memorymap};
 
-allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
+allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp};
 allow dom0_t dom0_t:grant {query setup};
 allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
 
@@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send};
 
 allow dom0_t dom0_t:event {send};
 allow dom0_t domU_t:grant {copy};
+allow domU_t domU_t:grant {copy};
 
 manage_domain(dom0_t, domU_t)
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/util/diagnose.py
--- a/tools/python/xen/util/diagnose.py Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/util/diagnose.py Tue Nov 04 12:43:19 2008 +0900
@@ -23,7 +23,7 @@ from xen.xend.XendClient import server
 from xen.xend.XendClient import server
 from xen.xend.XendError import XendError
 from xen.xend.xenstore.xstransact import xstransact
-from xen.xend.server import DevController
+from xen.xend.server import DevConstants
 
 import xen.xend.XendProtocol
 
@@ -169,7 +169,7 @@ def diagnose_hotplugging():
 
 
 def stateString(state):
-    return state and DevController.xenbusState[int(state)] or '<None>'
+    return state and DevConstants.xenbusState[int(state)] or '<None>'
 
 
 def main(argv = None):
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Tue Nov 04 12:43:19 2008 +0900
@@ -1602,21 +1602,21 @@ class XendConfig(dict):
         #   [vscsi,
         #     [dev,
         #       [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1],
-        #       [v-dev, 0:0:0:0], [state, Initialising]
+        #       [v-dev, 0:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2],
-        #       [v-dev, 0:0:0:1], [satet, Initialising]
+        #       [v-dev, 0:0:0:1], [satet, 1]
         #     ]
         #   ],
         #   [vscsi,
         #     [dev,
         #       [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0],
-        #       [v-dev, 1:0:0:0], [state, Initialising]
+        #       [v-dev, 1:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1],
-        #       [v-dev, 1:0:0:1], [satet, Initialising]
+        #       [v-dev, 1:0:0:1], [satet, 1]
         #     ]
         #   ]
         # ]
@@ -1632,18 +1632,19 @@ class XendConfig(dict):
         #   [vscsi,
         #     [dev,
         #       [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3],
-        #       [v-dev, 0:0:0:2], [state, Initialising]
+        #       [v-dev, 0:0:0:2], [state, 1]
         #     ]
         #   ]
         # ]
         #
-        # state 'Initialising' indicates that the device is being attached,
-        # while state 'Closing' indicates that the device is being detached.
+        # state xenbusState['Initialising'] indicates that the device is 
+        # being attached, while state xenbusState['Closing'] indicates 
+        # that the device is being detached.
         #
         # The Dict looks like this:
         #
         # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3,
-        #            v-dev: 0:0:0:2, state: Initialising} ] }
+        #            v-dev: 0:0:0:2, state: 1} ] }
 
         dev_config = {}
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py   Tue Nov 04 12:43:19 2008 +0900
@@ -52,6 +52,7 @@ from xen.xend.xenstore.xswatch import xs
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend.XendConstants import *
 from xen.xend.XendAPIConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xend.XendVMMetrics import XendVMMetrics
 
@@ -797,7 +798,7 @@ class XendDomainInfo:
         existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev'])
         state = dev['state']
 
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             # new create
             # If request devid does not exist, create and exit.
             if existing_dev_info is None:
@@ -806,25 +807,48 @@ class XendDomainInfo:
             elif existing_dev_info == "exists":
                 raise XendError("The virtual device %s is already defined" % dev['v-dev'])
 
-        elif state == 'Closing':
+        elif state == xenbusState['Closing']:
             if existing_dev_info is None:
                 raise XendError("Cannot detach vscsi device does not exist")
 
-        # use DevController.reconfigureDevice to change device config
-        dev_control = self.getDeviceController(dev_class)
-        dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
-        dev_control.waitForDevice_reconfigure(req_devid)
-        num_devs = dev_control.cleanupDevice(req_devid)
-
-        # update XendConfig with new device info
-        if dev_uuid:
-            new_dev_sxp = dev_control.configuration(req_devid)
+        if self.domid is not None:
+            # use DevController.reconfigureDevice to change device config
+            dev_control = self.getDeviceController(dev_class)
+            dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
+            dev_control.waitForDevice_reconfigure(req_devid)
+            num_devs = dev_control.cleanupDevice(req_devid)
+
+            # update XendConfig with new device info
+            if dev_uuid:
+                new_dev_sxp = dev_control.configuration(req_devid)
+                self.info.device_update(dev_uuid, new_dev_sxp)
+
+            # If there is no device left, destroy vscsi and remove config.
+            if num_devs == 0:
+                self.destroyDevice('vscsi', req_devid)
+                del self.info['devices'][dev_uuid]
+
+        else:
+            cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid, None)
+            new_dev_sxp = ['vscsi']
+            for cur_dev in sxp.children(cur_dev_sxp, 'dev'):
+                if state == xenbusState['Closing']:
+                    cur_dev_vdev = sxp.child_value(cur_dev, 'v-dev')
+                    if cur_dev_vdev == dev['v-dev']:
+                        continue
+                new_dev_sxp.append(cur_dev)
+
+            if state == xenbusState['Initialising']:
+                new_dev_sxp.append(sxp.child0(dev_sxp, 'dev'))
+
+            dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid')
             self.info.device_update(dev_uuid, new_dev_sxp)
 
-        # If there is no device left, destroy vscsi and remove config.
-        if num_devs == 0:
-            self.destroyDevice('vscsi', req_devid)
-            del self.info['devices'][dev_uuid]
+            # If there is only 'vscsi' in new_dev_sxp, remove the config.
+            if len(sxp.children(new_dev_sxp, 'dev')) == 0:
+                del self.info['devices'][dev_uuid]
+
+        xen.xend.XendDomain.instance().managed_config_save(self)
 
         return True
 
@@ -986,7 +1010,17 @@ class XendDomainInfo:
             sxprs = []
             dev_num = 0
             for dev_type, dev_info in self.info.all_devices_sxpr():
-                if dev_type == deviceClass:
+                if dev_type != deviceClass:
+                    continue
+
+                if deviceClass == 'vscsi':
+                    vscsi_devs = ['devs', []]
+                    for vscsi_dev in sxp.children(dev_info, 'dev'):
+                        vscsi_dev.append(['frontstate', None])
+                        vscsi_devs[1].append(vscsi_dev)
+                        dev_num = int(sxp.child_value(vscsi_dev, 'devid'))
+                    sxprs.append([dev_num, [vscsi_devs]])
+                else:
                     sxprs.append([dev_num, dev_info])
                     dev_num += 1
             return sxprs
@@ -2380,11 +2414,10 @@ class XendDomainInfo:
             time.sleep(2)
         for paths in plist:
             if paths.find('backend') != -1:
-                from xen.xend.server import DevController
                 # Modify online status /before/ updating state (latter is watched by
                 # drivers, so this ordering avoids a race).
                 xstransact.Write(paths, 'online', "0")
-                xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
+                xstransact.Write(paths, 'state', str(xenbusState['Closing']))
             # force
             xstransact.Remove(paths)
 
@@ -3439,7 +3472,7 @@ class XendDomainInfo:
                     ['p-devname', pscsi.get_dev_name()],
                     ['p-dev', pscsi.get_physical_HCTL()],
                     ['v-dev', xenapi_dscsi.get('virtual_HCTL')],
-                    ['state', 'Initialising'],
+                    ['state', xenbusState['Initialising']],
                     ['uuid', dscsi_uuid]
                 ]
             ]
@@ -3558,7 +3591,7 @@ class XendDomainInfo:
         if target_dev is None:
             raise XendError('Failed to destroy device')
 
-        target_dev.append(['state', 'Closing'])
+        target_dev.append(['state', xenbusState['Closing']])
         target_vscsi_sxp = ['vscsi', target_dev]
 
         if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevConstants.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/xend/server/DevConstants.py      Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,45 @@
+#============================================================================
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#============================================================================
+# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx>
+# Copyright (C) 2005 XenSource Ltd
+#============================================================================
+
+DEVICE_CREATE_TIMEOUT  = 100
+DEVICE_DESTROY_TIMEOUT = 100
+HOTPLUG_STATUS_NODE = "hotplug-status"
+HOTPLUG_ERROR_NODE  = "hotplug-error"
+HOTPLUG_STATUS_ERROR = "error"
+HOTPLUG_STATUS_BUSY  = "busy"
+
+Connected    = 1
+Error        = 2
+Missing      = 3
+Timeout      = 4
+Busy         = 5
+Disconnected = 6
+
+xenbusState = {
+    'Unknown'       : 0,
+    'Initialising'  : 1,
+    'InitWait'      : 2,
+    'Initialised'   : 3,
+    'Connected'     : 4,
+    'Closing'       : 5,
+    'Closed'        : 6,
+    'Reconfiguring' : 7,
+    'Reconfigured'  : 8,
+    }
+xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
+
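
The last line of DevConstants.py is what lets the rest of this patch use the table in both directions: xenbusState['Closing'] yields 5 for writing to xenstore, and xenbusState[5] yields 'Closing' for readable error messages (see the vscsiif.py hunk further down). A purely illustrative C analog of the same two-way mapping; the table and helper names are invented for the sketch:

    #include <string.h>

    static const char *xenbus_state_names[] = {
        "Unknown", "Initialising", "InitWait", "Initialised",
        "Connected", "Closing", "Closed", "Reconfiguring", "Reconfigured",
    };

    /* number -> name: index the table; name -> number: search it */
    static int xenbus_state_by_name(const char *name)
    {
        unsigned int i;

        for (i = 0; i < sizeof(xenbus_state_names) /
                        sizeof(xenbus_state_names[0]); i++)
            if (!strcmp(xenbus_state_names[i], name))
                return (int)i;
        return -1;
    }
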
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/DevController.py     Tue Nov 04 12:43:19 2008 +0900
@@ -23,41 +23,14 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 import xen.xend.XendConfig
+from xen.xend.server.DevConstants import *
 
 from xen.xend.xenstore.xstransact import xstransact, complete
 from xen.xend.xenstore.xswatch import xswatch
 
 import os
 
-DEVICE_CREATE_TIMEOUT  = 100
-DEVICE_DESTROY_TIMEOUT = 100
-HOTPLUG_STATUS_NODE = "hotplug-status"
-HOTPLUG_ERROR_NODE  = "hotplug-error"
-HOTPLUG_STATUS_ERROR = "error"
-HOTPLUG_STATUS_BUSY  = "busy"
-
-Connected    = 1
-Error        = 2
-Missing      = 3
-Timeout      = 4
-Busy         = 5
-Disconnected = 6
-
-xenbusState = {
-    'Unknown'      : 0,
-    'Initialising' : 1,
-    'InitWait'     : 2,
-    'Initialised'  : 3,
-    'Connected'    : 4,
-    'Closing'      : 5,
-    'Closed'       : 6,
-    'Reconfiguring': 7,
-    'Reconfigured' : 8,
-    }
-
 xoptions = XendOptions.instance()
-
-xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
 
 
 class DevController:
@@ -569,7 +542,7 @@ class DevController:
             xswatch(statusPath, hotplugStatusCallback, ev, result)
             ev.wait(DEVICE_CREATE_TIMEOUT)
             err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
-            if result['status'] != 'Connected':
+            if result['status'] != Connected:
                 return (result['status'], err)
             
         backpath = self.readVm(devid, "backend")
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/iopif.py
--- a/tools/python/xen/xend/server/iopif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/iopif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -45,8 +45,21 @@ def parse_ioport(val):
 
 class IOPortsController(DevController):
 
+    valid_cfg = ['to', 'from', 'uuid']
+
     def __init__(self, vm):
         DevController.__init__(self, vm)
+
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
 
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
@@ -81,4 +94,9 @@ class IOPortsController(DevController):
                 'ioports: Failed to configure legacy i/o range: %s - %s' %
                 (io_from, io_to))
 
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/irqif.py
--- a/tools/python/xen/xend/server/irqif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/irqif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -39,6 +39,18 @@ class IRQController(DevController):
     def __init__(self, vm):
         DevController.__init__(self, vm)
 
+    valid_cfg = ['irq', 'uuid']
+
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
 
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
@@ -75,4 +87,9 @@ class IRQController(DevController):
         if rc < 0:
             raise VmError(
                 'irq: Failed to map irq %x' % (pirq))
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -25,7 +25,8 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 
 import xen.lowlevel.xc
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/vscsiif.py
--- a/tools/python/xen/xend/server/vscsiif.py   Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/vscsiif.py   Tue Nov 04 12:43:19 2008 +0900
@@ -28,7 +28,8 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 from xen.xend.xenstore.xstransact import xstransact
 
 class VSCSIController(DevController):
@@ -92,8 +93,8 @@ class VSCSIController(DevController):
             back[devpath + '/p-devname'] = pdevname
             vdev = vscsi_config.get('v-dev', '')
             back[devpath + '/v-dev'] = vdev
-            state = vscsi_config.get('state', '')
-            back[devpath + '/state'] = str(xenbusState[state])
+            state = vscsi_config.get('state', xenbusState['Unknown'])
+            back[devpath + '/state'] = str(state)
             devid = vscsi_config.get('devid', '')
             back[devpath + '/devid'] = str(devid)
 
@@ -168,17 +169,17 @@ class VSCSIController(DevController):
         (devid, back, front) = self.getDeviceDetails(config)
         devid = int(devid)
         vscsi_config = config['devs'][0]
-        state = vscsi_config.get('state', '')
+        state = vscsi_config.get('state', xenbusState['Unknown'])
         driver_state = self.readBackend(devid, 'state')
         if str(xenbusState['Connected']) != driver_state:
             raise VmError("Driver status is not connected")
 
         uuid = self.readBackend(devid, 'uuid')
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             back['uuid'] = uuid
             self.writeBackend(devid, back)
 
-        elif state == 'Closing':
+        elif state == xenbusState['Closing']:
             found = False
             devs = self.readBackendList(devid, "vscsi-devs")
             vscsipath = "vscsi-devs/"
@@ -198,7 +199,7 @@ class VSCSIController(DevController):
 
         else:
             raise XendError("Error configuring device invalid "
-                            "state '%s'" % state)
+                            "state '%s'" % xenbusState[state])
 
         self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring']))
         return self.readBackend(devid, 'uuid')
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xm/create.py     Tue Nov 04 12:43:19 2008 +0900
@@ -32,6 +32,7 @@ from xen.xend import osdep
 from xen.xend import osdep
 import xen.xend.XendClient
 from xen.xend.XendBootloader import bootloader
+from xen.xend.server.DevConstants import xenbusState
 from xen.util import blkif
 from xen.util import vscsi_util
 import xen.util.xsm.xsm as security
@@ -707,7 +708,7 @@ def configure_vscsis(config_devs, vals):
             vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices)
 
         if p_hctl == None:
-            raise ValueError("Cannot find device \"%s\"" % p_dev)
+            raise ValueError('Cannot find device "%s"' % p_dev)
 
         for config in config_scsi:
             dev = vscsi_convert_sxp_to_dict(config)
@@ -717,7 +718,7 @@ def configure_vscsis(config_devs, vals):
         v_hctl = v_dev.split(':')
         devid = int(v_hctl[0])
         config_scsi.append(['dev', \
-                        ['state', 'Initialising'], \
+                        ['state', xenbusState['Initialising']], \
                         ['devid', devid], \
                         ['p-dev', p_hctl], \
                         ['p-devname', devname], \
@@ -1035,6 +1036,14 @@ def preprocess_ioports(vals):
         ioports.append(hexd)
     vals.ioports = ioports
         
+def preprocess_irq(vals):
+    if not vals.irq: return
+    irq = []
+    for v in vals.irq:
+        d = repr(v)
+        irq.append(d)
+    vals.irq = irq
+
 def preprocess_vtpm(vals):
     if not vals.vtpm: return
     vtpms = []
@@ -1133,6 +1142,7 @@ def preprocess(vals):
     preprocess_vscsi(vals)
     preprocess_ioports(vals)
     preprocess_ip(vals)
+    preprocess_irq(vals)
     preprocess_nfs(vals)
     preprocess_vtpm(vals)
     preprocess_access_control(vals)
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xm/main.py       Tue Nov 04 12:43:19 2008 +0900
@@ -47,6 +47,7 @@ from xen.xend import sxp
 from xen.xend import sxp
 from xen.xend import XendClient
 from xen.xend.XendConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xm.opts import OptionError, Opts, wrap, set_true
 from xen.xm import console
@@ -2515,7 +2516,7 @@ def xm_scsi_attach(args):
     dom = args[0]
     p_scsi = args[1]
     v_hctl = args[2]
-    scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising')
+    scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising'])
 
     if serverType == SERVER_XEN_API:
 
@@ -2635,7 +2636,7 @@ def xm_scsi_detach(args):
     arg_check(args, 'scsi-detach', 2)
     dom = args[0]
     v_hctl = args[1]
-    scsi = parse_scsi_configuration(None, v_hctl, 'Closing')
+    scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing'])
 
     if serverType == SERVER_XEN_API:
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenpmd/Makefile     Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,20 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += -Werror
+CFLAGS  += $(CFLAGS_libxenstore)
+LDFLAGS += $(LDFLAGS_libxenstore)
+
+BIN      = xenpmd
+
+.PHONY: all
+all: $(BIN)
+
+.PHONY: install
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+       $(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR)
+
+.PHONY: clean
+clean:
+       $(RM) -f $(BIN)
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/xenpmd.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenpmd/xenpmd.c     Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,520 @@
+/*
+ * xenpmd.c
+ *
+ * xen power management daemon - Facilitates power management 
+ * functionality within xen guests.
+ *
+ * Copyright (c) 2008  Kamala Narasimhan 
+ * Copyright (c) 2008  Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Xen extended power management support provides HVM guest power management
+ * features beyond S3, S4, S5.  For example, it helps expose system level 
+ * battery status and battery meter information and in future will be extended
+ * to include more power management support.  This extended power management 
+ * support is enabled by setting xen_extended_power_mgmt to 1 or 2 in the HVM
+ * config file.  When set to 2, non-pass through mode is enabled which heavily
+ * relies on this power management daemon to glean battery information from 
+ * dom0 and store it xenstore which would then be queries and used by qemu and 
+ * passed to the guest when appropriate battery ports are read/written to.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <xs.h>
+
+/* #define RUN_STANDALONE */
+#define RUN_IN_SIMULATE_MODE
+
+enum BATTERY_INFO_TYPE {
+    BIF, 
+    BST 
+};
+
+enum BATTERY_PRESENT {
+    NO, 
+    YES 
+};
+
+enum BATTERY_TECHNOLOGY {
+    NON_RECHARGEABLE, 
+    RECHARGEABLE 
+};
+
+struct battery_info {
+    enum BATTERY_PRESENT    present;
+    unsigned long           design_capacity;
+    unsigned long           last_full_capacity;
+    enum BATTERY_TECHNOLOGY battery_technology;
+    unsigned long           design_voltage;
+    unsigned long           design_capacity_warning;
+    unsigned long           design_capacity_low;
+    unsigned long           capacity_granularity_1;
+    unsigned long           capacity_granularity_2;
+    char                    model_number[32];
+    char                    serial_number[32];
+    char                    battery_type[32];
+    char                    oem_info[32];
+};
+
+struct battery_status {
+    enum BATTERY_PRESENT    present;
+    unsigned long           state;
+    unsigned long           present_rate;
+    unsigned long           remaining_capacity;
+    unsigned long           present_voltage;
+};
+
+static struct xs_handle *xs;
+
+#ifdef RUN_IN_SIMULATE_MODE
+    #define BATTERY_DIR_PATH "/tmp/battery"
+    #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" 
+    #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state"
+#else
+    #define BATTERY_DIR_PATH "/proc/acpi/battery"
+    #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info"
+    #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state"
+#endif
+
+FILE *get_next_battery_file(DIR *battery_dir, 
+                            enum BATTERY_INFO_TYPE battery_info_type)
+{
+    FILE *file = 0;
+    struct dirent *dir_entries;
+    char file_name[32];
+    
+    do 
+    {
+        dir_entries = readdir(battery_dir);
+        if ( !dir_entries ) 
+            return 0;
+        if ( strlen(dir_entries->d_name) < 4 )
+            continue;
+        if ( battery_info_type == BIF ) 
+            snprintf(file_name, 32, BATTERY_INFO_FILE_PATH,
+                     dir_entries->d_name);
+        else 
+            snprintf(file_name, 32, BATTERY_STATE_FILE_PATH,
+                     dir_entries->d_name);
+        file = fopen(file_name, "r");
+    } while ( !file );
+
+    return file;
+}
+
+void set_attribute_battery_info(char *attrib_name,
+                                char *attrib_value,
+                                struct battery_info *info)
+{
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            info->present = YES;
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity warning") ) 
+    {
+        info->design_capacity_warning = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity low") ) 
+    {
+        info->design_capacity_low = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity") ) 
+    { 
+        info->design_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "last full capacity") ) 
+    {
+        info->last_full_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design voltage") ) 
+    {
+        info->design_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 1") ) 
+    {
+        info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 2") ) 
+    {
+        info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery technology") ) 
+    {
+        if ( strncmp(attrib_value, "rechargeable",
+                     strlen("rechargeable")) == 0 ) 
+            info->battery_technology = RECHARGEABLE;
+        else 
+            info->battery_technology = NON_RECHARGEABLE;
+        return;
+    }
+
+    if ( strstr(attrib_name, "model number") ) 
+    {
+        strncpy(info->model_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "serial number") ) 
+    {
+        strncpy(info->serial_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery type") ) 
+    {
+        strncpy(info->battery_type, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "OEM info") ) 
+    {
+        strncpy(info->oem_info, attrib_value, 32);
+        return;
+    }
+
+    return;
+}
+
+void set_attribute_battery_status(char *attrib_name, 
+                                  char *attrib_value,
+                                  struct battery_status *status)
+{
+    if ( strstr(attrib_name, "charging state") ) 
+    {
+        /* Check this, below is half baked */
+        if ( strstr(attrib_value, "charged") ) 
+            status->state = 0;
+        else 
+            status->state = 1;
+        return;
+    }
+
+    if ( strstr(attrib_name, "present rate") ) 
+    {
+        status->present_rate = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "remaining capacity") ) 
+    {
+        status->remaining_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present voltage") ) 
+    {
+        status->present_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            status->present = YES;
+        return;
+    }
+}
+
+void parse_battery_info_or_status(char *line_info,
+                                  enum BATTERY_INFO_TYPE type,
+                                  void *info_or_status)
+{
+    char attrib_name[128];
+    char attrib_value[64];
+    char *delimiter;
+    unsigned long length;
+
+    length = strlen(line_info);
+    delimiter = (char *) strchr( line_info, ':');
+    if ( (!delimiter) || (delimiter == line_info) ||
+         (delimiter == line_info + length) ) 
+        return;
+
+    strncpy(attrib_name, line_info, delimiter-line_info);
+    while ( *(delimiter+1) == ' ' ) 
+    {
+        delimiter++;
+        if ( delimiter+1 == line_info + length)
+            return;
+    }
+    strncpy(attrib_value, delimiter+1, 
+            (unsigned long)line_info + length -(unsigned long)delimiter); 
+    
+    if ( type == BIF ) 
+        set_attribute_battery_info(attrib_name, attrib_value,
+                                   (struct battery_info *)info_or_status);
+    else 
+        set_attribute_battery_status(attrib_name, attrib_value,
+                                     (struct battery_status *)info_or_status);
+
+    return;
+}
+
+int get_next_battery_info_or_status(DIR *battery_dir,
+                                    enum BATTERY_INFO_TYPE type,
+                                    void *info_or_status)
+{
+    FILE *file;
+    char line_info[256];
+
+    if  ( !info_or_status )
+        return 0;
+
+    memset(line_info, 0, 256);
+    if (type == BIF) 
+        memset(info_or_status, 0, sizeof(struct battery_info));
+    else 
+        memset(info_or_status, 0, sizeof(struct battery_status));
+
+    file = get_next_battery_file(battery_dir, type);
+    if ( !file )
+        return 0;
+
+    while ( fgets(line_info, 1024, file) != NULL ) 
+    {
+        parse_battery_info_or_status(line_info, type, info_or_status);
+        memset(line_info, 0, 256);
+    }
+
+    fclose(file);
+    return 1;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_info(struct battery_info *info)
+{
+    printf("present:                %d\n", info->present);
+    printf("design capacity:        %d\n", info->design_capacity);
+    printf("last full capacity:     %d\n", info->last_full_capacity);
+    printf("battery technology:     %d\n", info->battery_technology);
+    printf("design voltage:         %d\n", info->design_voltage);
+    printf("design capacity warning:%d\n", info->design_capacity_warning);
+    printf("design capacity low:    %d\n", info->design_capacity_low);
+    printf("capacity granularity 1: %d\n", info->capacity_granularity_1);
+    printf("capacity granularity 2: %d\n", info->capacity_granularity_2);
+    printf("model number:           %s\n", info->model_number);
+    printf("serial number:          %s\n", info->serial_number);
+    printf("battery type:           %s\n", info->battery_type);
+    printf("OEM info:               %s\n", info->oem_info);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_ulong_lsb_first(char *temp_val, unsigned long val)
+{
+    snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, 
+    (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, 
+    (unsigned int)(val & 0xff000000) >> 24);
+}
+
+void write_battery_info_to_xenstore(struct battery_info *info)
+{
+    char val[1024], string_info[256];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+   
+    memset(val, 0, 1024);
+    memset(string_info, 0, 256);
+    /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */
+    snprintf(val, 3, "%02x", 
+             (unsigned int)(9*4 +
+                            strlen(info->model_number) +
+                            strlen(info->serial_number) +
+                            strlen(info->battery_type) +
+                            strlen(info->oem_info) + 4));
+    write_ulong_lsb_first(val+2, info->present);
+    write_ulong_lsb_first(val+10, info->design_capacity);
+    write_ulong_lsb_first(val+18, info->last_full_capacity);
+    write_ulong_lsb_first(val+26, info->battery_technology);
+    write_ulong_lsb_first(val+34, info->design_voltage);
+    write_ulong_lsb_first(val+42, info->design_capacity_warning);
+    write_ulong_lsb_first(val+50, info->design_capacity_low);
+    write_ulong_lsb_first(val+58, info->capacity_granularity_1);
+    write_ulong_lsb_first(val+66, info->capacity_granularity_2);
+
+    snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", 
+             (unsigned int)strlen(info->model_number), info->model_number,
+             (unsigned int)strlen(info->serial_number), info->serial_number,
+             (unsigned int)strlen(info->battery_type), info->battery_type,
+             (unsigned int)strlen(info->oem_info), info->oem_info);
+    strncat(val+73, string_info, 1024);
+    xs_write(xs, XBT_NULL, "/pm/bif", 
+             val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+
+             strlen(info->battery_type)+strlen(info->oem_info)+1);
+}
+
+int write_one_time_battery_info(void)
+{
+    DIR *dir;
+    int ret = 0;
+    struct battery_info info;
+    
+    dir = opendir(BATTERY_DIR_PATH);
+    if ( !dir )
+        return 0;
+
+    while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) 
+    {
+#ifdef RUN_STANDALONE
+        print_battery_info(&info);
+#endif
+        if ( info.present == YES ) 
+        {
+            write_battery_info_to_xenstore(&info);
+            ret = 1;
+            break; /* rethink this... */
+        }
+    }
+
+    closedir(dir);
+    return ret;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_status(struct battery_status *status)
+{
+    printf("present:                     %d\n", status->present);
+    printf("Battery state                %d\n", status->state);
+    printf("Battery present rate         %d\n", status->present_rate);
+    printf("Battery remining capacity    %d\n", status->remaining_capacity);
+    printf("Battery present voltage      %d\n", status->present_voltage);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_battery_status_to_xenstore(struct battery_status *status)
+{
+    char val[35];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+
+    memset(val, 0, 35);
+    snprintf(val, 3, "%02x", 16);
+    write_ulong_lsb_first(val+2, status->state);
+    write_ulong_lsb_first(val+10, status->present_rate);
+    write_ulong_lsb_first(val+18, status->remaining_capacity);
+    write_ulong_lsb_first(val+26, status->present_voltage);
+
+    xs_write(xs, XBT_NULL, "/pm/bst", val, 35);
+}
+
+int wait_for_and_update_battery_status_request(void)
+{
+    DIR *dir;
+    int ret = 0;
+    unsigned int count;
+    struct battery_status status;
+
+    while ( true )
+    {
+        /* KN:@TODO - It is rather inefficient to not cache the file handle.
+         *  Switch to caching file handle. 
+         */
+        dir = opendir(BATTERY_DIR_PATH);
+        if ( !dir )
+            return 0;
+
+        while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) 
+        {
+#ifdef RUN_STANDALONE
+            print_battery_status(&status);
+#endif
+            if ( status.present == YES ) 
+            {
+                write_battery_status_to_xenstore(&status);
+                ret = 1;
+                /* rethink this; though I have never seen, there might be
+                 * systems out there with more than one battery device 
+                 * present
+                 */
+                break;
+            }
+        }
+        closedir(dir);
+        xs_watch(xs, "/pm/events", "refreshbatterystatus");
+        xs_read_watch(xs, &count); 
+    }
+
+    return ret;
+}
+
+/* Borrowed daemonize from xenstored - Initially written by Stevens. */
+static void daemonize(void)
+{
+    pid_t pid;
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    setsid();
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    if ( chdir("/") == -1 )
+        exit(1);
+
+    umask(0);
+}
+
+int main(int argc, char *argv[])
+{
+#ifndef RUN_STANDALONE
+    daemonize();
+#endif
+    xs = (struct xs_handle *)xs_daemon_open();
+    if ( xs == NULL ) 
+        return -1;
+
+    if ( write_one_time_battery_info() == 0 ) 
+    {
+        xs_daemon_close(xs);
+        return -1;
+    }
+
+    wait_for_and_update_battery_status_request();
+    xs_daemon_close(xs);
+    return 0;
+}
+
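
On the consumer side (qemu, per the header comment in xenpmd.c above), each value the daemon writes is an ASCII hex string: a two-character length field followed by 32-bit values serialized least-significant byte first by write_ulong_lsb_first(). A minimal decoding sketch that reads /pm/bst back and unpacks its four dwords at the matching offsets (2, 10, 18, 26); it assumes the standard libxenstore read call xs_read() from <xs.h>, alongside the xs_daemon_open()/xs_daemon_close() calls the daemon itself uses:

    #include <stdio.h>
    #include <stdlib.h>
    #include <xs.h>

    /* Decode one dword stored as 8 hex chars, least-significant byte first. */
    static unsigned long read_ulong_lsb_first(const char *p)
    {
        unsigned int b0, b1, b2, b3;

        if (sscanf(p, "%2x%2x%2x%2x", &b0, &b1, &b2, &b3) != 4)
            return 0;
        return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    }

    int main(void)
    {
        struct xs_handle *xs = xs_daemon_open();
        unsigned int len;
        char *val;

        if (!xs)
            return 1;
        val = xs_read(xs, XBT_NULL, "/pm/bst", &len);
        if (val && len >= 34) {
            printf("state:              %lu\n", read_ulong_lsb_first(val + 2));
            printf("present rate:       %lu\n", read_ulong_lsb_first(val + 10));
            printf("remaining capacity: %lu\n", read_ulong_lsb_first(val + 18));
            printf("present voltage:    %lu\n", read_ulong_lsb_first(val + 26));
        }
        free(val);
        xs_daemon_close(xs);
        return 0;
    }
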
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/cpufreq/cpufreq.c
--- a/xen/arch/ia64/xen/cpufreq/cpufreq.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/ia64/xen/cpufreq/cpufreq.c       Tue Nov 04 12:43:19 2008 +0900
@@ -210,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_po
 
        data->acpi_data = &processor_pminfo[cpu]->perf;
 
-       /* capability check */
-       if (data->acpi_data->state_count <= 1) {
-               printk(KERN_WARNING "P-States\n");
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
-       if ((data->acpi_data->control_register.space_id !=
-                               ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-                       (data->acpi_data->status_register.space_id !=
-                        ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
        data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
                        (data->acpi_data->state_count + 1));
        if (!data->freq_table) {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/irq.c
--- a/xen/arch/ia64/xen/irq.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/ia64/xen/irq.c   Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq 
 /*
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+irq_desc_t irq_desc[NR_IRQS] = {
        [0 ... NR_IRQS-1] = {
                .status = IRQ_DISABLED,
                .handler = &no_irq_type,
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpu_idle.c      Tue Nov 04 12:43:19 2008 +0900
@@ -75,13 +75,14 @@ static void print_acpi_power(uint32_t cp
 
     printk("==cpu%d==\n", cpu);
     printk("active state:\t\tC%d\n",
-           power->last_state ? (int)(power->last_state - power->states) : -1);
+           power->last_state ? power->last_state->idx : -1);
     printk("max_cstate:\t\tC%d\n", max_cstate);
     printk("states:\n");
     
     for ( i = 1; i < power->count; i++ )
     {
-        printk((power->last_state == &power->states[i]) ? "   *" : "    ");
+        printk((power->last_state && power->last_state->idx == i) ?
+               "   *" : "    ");
         printk("C%d:\t", i);
         printk("type[C%d] ", power->states[i].type);
         printk("latency[%03d] ", power->states[i].latency);
@@ -139,20 +140,26 @@ static void acpi_processor_ffh_cstate_en
 
 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
-    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
-    {
+    int unused;
+
+    switch ( cx->entry_method )
+    {
+    case ACPI_CSTATE_EM_FFH:
         /* Call into architectural FFH based C-state */
         acpi_processor_ffh_cstate_enter(cx);
-    }
-    else
-    {
-        int unused;
+        return;
+    case ACPI_CSTATE_EM_SYSIO:
         /* IO port based C-state */
         inb(cx->address);
         /* Dummy wait op - must do something useless after P_LVL2 read
            because chipsets cannot guarantee that STPCLK# signal
            gets asserted in time to freeze execution properly. */
         unused = inl(pmtmr_ioport);
+        return;
+    case ACPI_CSTATE_EM_HALT:
+        acpi_safe_halt();
+        local_irq_disable();
+        return;
     }
 }
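A note on the ACPI_CSTATE_EM_HALT case above: acpi_safe_halt() conventionally enables interrupts and halts in a single step, so a wakeup arriving between the two cannot be lost; the explicit local_irq_disable() afterwards restores the interrupts-off state that the FFH and SYSIO paths leave behind. The conventional shape, shown here as an assumption rather than a quote from the tree:

    /* Sketch of the usual acpi_safe_halt() implementation (assumed, not
     * taken from this patch): "sti; hlt" halts and wakes on the next
     * interrupt without a window in which the interrupt could be lost. */
    static inline void acpi_safe_halt_sketch(void)
    {
        asm volatile ( "sti; hlt" : : : "memory" );
    }
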
 
@@ -222,7 +229,7 @@ static void acpi_processor_idle(void)
         if ( power->flags.bm_check && acpi_idle_bm_check()
              && cx->type == ACPI_STATE_C3 )
             cx = power->safe_state;
-        if ( cx - &power->states[0] > max_cstate )
+        if ( cx->idx > max_cstate )
             cx = &power->states[max_cstate];
     }
     if ( !cx )
@@ -252,35 +259,11 @@ static void acpi_processor_idle(void)
     switch ( cx->type )
     {
     case ACPI_STATE_C1:
-        /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, 1);
-
-        /*
-         * Invoke C1.
-         * Use the appropriate idle routine, the one that would
-         * be used without acpi C-states.
-         */
-        if ( pm_idle_save )
-            pm_idle_save();
-        else 
-            acpi_safe_halt();
-
-        /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, 1);
-
-        /*
-         * TBD: Can't get time duration while in C1, as resumes
-         *      go to an ISR rather than here.  Need to instrument
-         *      base interrupt handler.
-         */
-        sleep_ticks = 0xFFFFFFFF;
-        break;
-
     case ACPI_STATE_C2:
-        if ( local_apic_timer_c2_ok )
+        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
         {
             /* Trace cpu idle entry */
-            TRACE_1D(TRC_PM_IDLE_ENTRY, 2);
+            TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
             /* Get start time (ticks) */
             t1 = inl(pmtmr_ioport);
             /* Invoke C2 */
@@ -288,7 +271,7 @@ static void acpi_processor_idle(void)
             /* Get end time (ticks) */
             t2 = inl(pmtmr_ioport);
             /* Trace cpu idle exit */
-            TRACE_1D(TRC_PM_IDLE_EXIT, 2);
+            TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
 
             /* Re-enable interrupts */
             local_irq_enable();
@@ -328,7 +311,7 @@ static void acpi_processor_idle(void)
         }
 
         /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]);
+        TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
         /*
          * Before invoking C3, be aware that TSC/APIC timer may be 
          * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
@@ -349,7 +332,7 @@ static void acpi_processor_idle(void)
         /* recovering TSC */
         cstate_restore_tsc();
         /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]);
+        TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
 
         if ( power->flags.bm_check && power->flags.bm_control )
         {
@@ -387,9 +370,15 @@ static void acpi_processor_idle(void)
 
 static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
 {
+    int i;
+
     memset(acpi_power, 0, sizeof(*acpi_power));
 
+    for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+        acpi_power->states[i].idx = i;
+
     acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+    acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT;
 
     acpi_power->states[ACPI_STATE_C0].valid = 1;
     acpi_power->states[ACPI_STATE_C1].valid = 1;
@@ -486,16 +475,13 @@ static int check_cx(struct acpi_processo
         break;
 
     case ACPI_ADR_SPACE_FIXED_HARDWARE:
-        if ( cx->type > ACPI_STATE_C1 )
-        {
-            if ( cx->reg.bit_width != VENDOR_INTEL || 
-                 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
-                return -EINVAL;
-
-            /* assume all logical cpu has the same support for mwait */
-            if ( acpi_processor_ffh_cstate_probe(cx) )
-                return -EINVAL;
-        }
+        if ( cx->reg.bit_width != VENDOR_INTEL || 
+             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+            return -EINVAL;
+
+        /* Assume all logical CPUs have the same support for MWAIT. */
+        if ( acpi_processor_ffh_cstate_probe(cx) )
+            return -EINVAL;
         break;
 
     default:
@@ -599,7 +585,23 @@ static void set_cx(
     cx->valid    = 1;
     cx->type     = xen_cx->type;
     cx->address  = xen_cx->reg.address;
-    cx->space_id = xen_cx->reg.space_id;
+
+    switch ( xen_cx->reg.space_id )
+    {
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
+             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT )
+            cx->entry_method = ACPI_CSTATE_EM_FFH;
+        else
+            cx->entry_method = ACPI_CSTATE_EM_HALT;
+        break;
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
+        break;
+    default:
+        cx->entry_method = ACPI_CSTATE_EM_NONE;
+    }
+
     cx->latency  = xen_cx->latency;
     cx->power    = xen_cx->power;
     
@@ -761,8 +763,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
         return 0;
     }
 
-    stat->last = (power->last_state) ?
-        (int)(power->last_state - &power->states[0]) : 0;
+    stat->last = power->last_state ? power->last_state->idx : 0;
     stat->nr = power->count;
     stat->idle_time = v->runstate.time[RUNSTATE_running];
     if ( v->is_running )
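Summarizing the set_cx() hunk above: the ACPI register description collapses to one of three concrete entry methods, and anything else is marked as having no known entry method. A compressed restatement (enum names mirror the patch; the numeric space-id values follow ACPICA and are assumptions here):

    enum entry_method { EM_NONE, EM_FFH, EM_SYSIO, EM_HALT };

    static enum entry_method pick_entry_method(int space_id, int intel_mwait)
    {
        switch ( space_id )
        {
        case 0x7f:                  /* ACPI_ADR_SPACE_FIXED_HARDWARE */
            return intel_mwait ? EM_FFH : EM_HALT;
        case 0x01:                  /* ACPI_ADR_SPACE_SYSTEM_IO */
            return EM_SYSIO;
        default:
            return EM_NONE;
        }
    }
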
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c       Tue Nov 04 12:43:19 2008 +0900
@@ -370,7 +370,7 @@ static int acpi_cpufreq_target(struct cp
     if (!check_freqs(cmd.mask, freqs.new, data))
         return -EAGAIN;
 
-    for_each_cpu_mask(j, cmd.mask)
+    for_each_cpu_mask(j, online_policy_cpus)
         cpufreq_statistic_update(j, perf->state, next_perf_state);
 
     perf->state = next_perf_state;
@@ -447,18 +447,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
     perf = data->acpi_data;
     policy->shared_type = perf->shared_type;
 
-    /* capability check */
-    if (perf->state_count <= 1) {
-        printk("No P-States\n");
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
-    if (perf->control_register.space_id != perf->status_register.space_id) {
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
     switch (perf->control_register.space_id) {
     case ACPI_ADR_SPACE_SYSTEM_IO:
         printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/powernow.c
--- a/xen/arch/x86/acpi/cpufreq/powernow.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c      Tue Nov 04 12:43:19 2008 +0900
@@ -229,9 +229,23 @@ err_unreg:
     return result;
 }
 
+static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+
+    if (data) {
+        drv_data[policy->cpu] = NULL;
+        xfree(data->freq_table);
+        xfree(data);
+    }
+
+    return 0;
+}
+
 static struct cpufreq_driver powernow_cpufreq_driver = {
     .target = powernow_cpufreq_target,
     .init   = powernow_cpufreq_cpu_init,
+    .exit   = powernow_cpufreq_cpu_exit
 };
 
 int powernow_cpufreq_init(void)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpuidle_menu.c
--- a/xen/arch/x86/acpi/cpuidle_menu.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpuidle_menu.c  Tue Nov 04 12:43:19 2008 +0900
@@ -59,7 +59,7 @@ static int menu_select(struct acpi_proce
     data->expected_us = (u32) get_sleep_length_ns() / 1000;
 
     /* find the deepest idle state that satisfies our constraints */
-    for ( i = 1; i < power->count; i++ )
+    for ( i = 2; i < power->count; i++ )
     {
         struct acpi_processor_cx *s = &power->states[i];
 
@@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_pro
     unsigned int last_residency; 
     unsigned int measured_us;
 
-    /*
-     * Ugh, this idle state doesn't support residency measurements, so we
-     * are basically lost in the dark.  As a compromise, assume we slept
-     * for one full standard timer tick.  However, be aware that this
-     * could potentially result in a suboptimal state transition.
-     */
-    if ( target->type == ACPI_STATE_C1 )
-        last_residency = USEC_PER_SEC / HZ;
-    else
-        last_residency = power->last_residency;
-
+    last_residency = power->last_residency;
     measured_us = last_residency + data->elapsed_us;
 
     /* if wrapping, set to max uint (-1) */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/domain.c     Tue Nov 04 12:43:19 2008 +0900
@@ -174,9 +174,10 @@ void free_vcpu_struct(struct vcpu *v)
 
 static int setup_compat_l4(struct vcpu *v)
 {
-    struct page_info *pg = alloc_domheap_page(NULL, 0);
+    struct page_info *pg;
     l4_pgentry_t *l4tab;
 
+    pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
     if ( pg == NULL )
         return -ENOMEM;
 
@@ -1639,31 +1640,22 @@ static int relinquish_memory(
         }
 
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
-            put_page_and_type(page);
+            ret = put_page_and_type_preemptible(page, 1);
+        switch ( ret )
+        {
+        case 0:
+            break;
+        case -EAGAIN:
+        case -EINTR:
+            set_bit(_PGT_pinned, &page->u.inuse.type_info);
+            put_page(page);
+            goto out;
+        default:
+            BUG();
+        }
 
         if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
             put_page(page);
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        /*
-         * Forcibly drop reference counts of page tables above top most (which
-         * were skipped to prevent long latencies due to deep recursion - see
-         * the special treatment in free_lX_table()).
-         */
-        y = page->u.inuse.type_info;
-        if ( (type < PGT_root_page_table) &&
-             unlikely(((y + PGT_type_mask) &
-                       (PGT_type_mask|PGT_validated)) == type) )
-        {
-            BUG_ON((y & PGT_count_mask) >=
-                   (page->count_info & PGC_count_mask));
-            while ( y & PGT_count_mask )
-            {
-                put_page_and_type(page);
-                y = page->u.inuse.type_info;
-            }
-        }
-#endif
 
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
@@ -1685,8 +1677,31 @@ static int relinquish_memory(
                         x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                if ( free_page_type(page, x, 0) != 0 )
+                /* No need for atomic update of type_info here: no one else updates it. */
+                switch ( ret = free_page_type(page, x, 1) )
+                {
+                case 0:
+                    break;
+                case -EINTR:
+                    page->u.inuse.type_info |= PGT_validated;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    put_page(page);
+                    ret = -EAGAIN;
+                    goto out;
+                case -EAGAIN:
+                    page->u.inuse.type_info |= PGT_partial;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    goto out;
+                default:
                     BUG();
+                }
+                if ( x & PGT_partial )
+                {
+                    page->u.inuse.type_info--;
+                    put_page(page);
+                }
                 break;
             }
         }
@@ -1831,11 +1846,6 @@ int domain_relinquish_resources(struct d
         /* fallthrough */
 
     case RELMEM_done:
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
-        if ( ret )
-            return ret;
-#endif
         break;
 
     default:
@@ -1891,6 +1901,54 @@ void domain_cpuid(
 
     *eax = *ebx = *ecx = *edx = 0;
 }
+
+void vcpu_kick(struct vcpu *v)
+{
+    /*
+     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
+     * pending flag. These values may fluctuate (after all, we hold no
+     * locks) but the key insight is that each change will cause
+     * evtchn_upcall_pending to be polled.
+     * 
+     * NB2. We save the running flag across the unblock to avoid a needless
+     * IPI for domains that we IPI'd to unblock.
+     */
+    bool_t running = v->is_running;
+    vcpu_unblock(v);
+    if ( running && (in_irq() || (v != current)) )
+        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
+}
+
+void vcpu_mark_events_pending(struct vcpu *v)
+{
+    int already_pending = test_and_set_bit(
+        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
+
+    if ( already_pending )
+        return;
+
+    if ( is_hvm_vcpu(v) )
+        hvm_assert_evtchn_irq(v);
+    else
+        vcpu_kick(v);
+}
+
+static void vcpu_kick_softirq(void)
+{
+    /*
+     * Nothing to do here: we merely prevent notifiers from racing with checks
+     * executed on return to guest context with interrupts enabled. See, for
+     * example, xxx_intr_assist() executed on return to HVM guest context.
+     */
+}
+
+static int __init init_vcpu_kick_softirq(void)
+{
+    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
+    return 0;
+}
+__initcall(init_vcpu_kick_softirq);
+
 
 /*
  * Local variables:
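The ordering argument in vcpu_kick()'s comment can be restated with portable atomics: the notifier publishes the pending flag before inspecting the target's state, while any state change on the target's side is followed by a re-poll of the flag, so at least one side always observes the other. A generic sketch with C11 atomics standing in for Xen's primitives (an illustration, not Xen code):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool pending;   /* evtchn_upcall_pending stand-in */
    static atomic_bool running;   /* v->is_running stand-in         */

    static void kick(void) { /* stand-in for cpu_raise_softirq() */ }

    static void notify(void)
    {
        atomic_store(&pending, true);   /* 1: publish the event first  */
        if ( atomic_load(&running) )    /* 2: only then inspect state  */
            kick();
    }

    static bool target_enter_run(void)
    {
        atomic_store(&running, true);   /* any state change ...        */
        return atomic_load(&pending);   /* ... forces a flag re-poll   */
    }
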
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/domain_build.c       Tue Nov 04 12:43:19 2008 +0900
@@ -194,6 +194,30 @@ static void __init process_dom0_ioports_
     }
 }
 
+/* We run on dom0's page tables for the final part of the build process. */
+static void dom0_pt_enter(struct vcpu *v)
+{
+    struct desc_ptr gdt_desc = {
+        .limit = LAST_RESERVED_GDT_BYTE,
+        .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)
+    };
+
+    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    write_ptbase(v);
+}
+
+/* Return to idle domain's page tables. */
+static void dom0_pt_exit(void)
+{
+    struct desc_ptr gdt_desc = {
+        .limit = LAST_RESERVED_GDT_BYTE,
+        .base = GDT_VIRT_START(current)
+    };
+
+    write_ptbase(current);
+    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+}
+
 int __init construct_dom0(
     struct domain *d,
     unsigned long _image_start, unsigned long image_len, 
@@ -700,14 +724,12 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, i % num_online_cpus());
 
     /* Set up CR3 value for write_ptbase */
-    if ( paging_mode_enabled(v->domain) )
+    if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
     else
         update_cr3(v);
 
-    /* Install the new page tables. */
-    local_irq_disable();
-    write_ptbase(v);
+    dom0_pt_enter(v);
 
     /* Copy the OS image and free temporary buffer. */
     elf.dest = (void*)vkern_start;
@@ -804,9 +826,7 @@ int __init construct_dom0(
         xlat_start_info(si, XLAT_start_info_console_dom0);
 #endif
 
-    /* Reinstate the caller's page tables. */
-    write_ptbase(current);
-    local_irq_enable();
+    dom0_pt_exit();
 
 #if defined(__i386__)
     /* Destroy low mappings - they were only for our convenience. */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hpet.c       Tue Nov 04 12:43:19 2008 +0900
@@ -14,8 +14,6 @@
 #include <asm/div64.h>
 #include <asm/hpet.h>
 
-#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
-
 #define MAX_DELTA_NS MILLISECS(10*1000)
 #define MIN_DELTA_NS MICROSECS(20)
 
@@ -146,7 +144,7 @@ static void handle_hpet_broadcast(struct
     s_time_t now, next_event;
     int cpu;
 
-    spin_lock(&ch->lock);
+    spin_lock_irq(&ch->lock);
 
 again:
     ch->next_event = STIME_MAX;
@@ -171,7 +169,7 @@ again:
         if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
             goto again;
     }
-    spin_unlock(&ch->lock);
+    spin_unlock_irq(&ch->lock);
 }
 
 void hpet_broadcast_init(void)
@@ -213,6 +211,7 @@ void hpet_broadcast_enter(void)
 {
     struct hpet_event_channel *ch = &hpet_event;
 
+    ASSERT(!local_irq_is_enabled());
     spin_lock(&ch->lock);
 
     disable_APIC_timer();
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c        Tue Nov 04 12:43:19 2008 +0900
@@ -14,10 +14,38 @@
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/paging.h>
+#include <xen/trace.h>
 #include <asm/event.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
+
+#define HVMTRACE_IO_ASSIST_WRITE 0x200
+static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
+{
+    unsigned int size, event;
+    unsigned char buffer[12];
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST;
+    if ( !p->dir )
+        event |= HVMTRACE_IO_ASSIST_WRITE;
+
+    *(uint64_t *)buffer = p->addr;
+    size = (p->addr != (u32)p->addr) ? 8 : 4;
+    if ( size == 8 )
+        event |= TRC_64_FLAG;
+
+    if ( !p->data_is_ptr )
+    {
+        *(uint32_t *)&buffer[size] = p->data;
+        size += 4;
+    }
+
+    trace_var(event, 0/*!cycles*/, size, buffer);
+}
 
 static int hvmemul_do_io(
     int is_mmio, paddr_t addr, unsigned long *reps, int size,
@@ -110,6 +138,8 @@ static int hvmemul_do_io(
     p->df = df;
     p->data = value;
     p->io_count++;
+
+    hvmtrace_io_assist(is_mmio, p);
 
     if ( is_mmio )
     {
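The record hvmtrace_io_assist() above hands to trace_var() is variable length: 4 bytes of address when it fits in 32 bits, otherwise 8 bytes plus the TRC_64_FLAG tag, optionally followed by 4 bytes of inline data. A standalone restatement of the packing (a hypothetical helper; little-endian layout assumed, as the original's casts already do):

    #include <stdint.h>
    #include <string.h>

    static unsigned int pack_io_trace(uint64_t addr, uint32_t data,
                                      int data_is_ptr, unsigned char buf[12])
    {
        unsigned int size = (addr != (uint32_t)addr) ? 8 : 4;

        memcpy(buf, &addr, 8);          /* low 4 bytes used when size == 4 */
        if ( !data_is_ptr )
        {
            memcpy(&buf[size], &data, 4);
            size += 4;
        }
        return size;                    /* bytes handed to trace_var() */
    }
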
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hpet.c
--- a/xen/arch/x86/hvm/hpet.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/hpet.c   Tue Nov 04 12:43:19 2008 +0900
@@ -76,6 +76,7 @@
         ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
 
 #define timer_config(h, n)       (h->hpet.timers[n].config)
+#define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
 #define timer_is_periodic(h, n)  (timer_config(h, n) & HPET_TN_PERIODIC)
 #define timer_is_32bit(h, n)     (timer_config(h, n) & HPET_TN_32BIT)
 #define hpet_enabled(h)          (h->hpet.config & HPET_CFG_ENABLE)
@@ -88,9 +89,40 @@
     ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \
         >> HPET_TN_INT_ROUTE_CAP_SHIFT)
 
-#define hpet_time_after(a, b)   ((int32_t)(b) - (int32_t)(a) < 0)
-#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0)
-
+static inline uint64_t hpet_read_maincounter(HPETState *h)
+{
+    ASSERT(spin_is_locked(&h->lock));
+
+    if ( hpet_enabled(h) )
+        return guest_time_hpet(h->vcpu) + h->mc_offset;
+    else 
+        return h->hpet.mc64;
+}
+
+static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn)
+{
+    uint64_t comparator;
+    uint64_t elapsed;
+
+    comparator = h->hpet.comparator64[tn];
+    if ( timer_is_periodic(h, tn) )
+    {
+        /* update comparator by number of periods elapsed since last update */
+        uint64_t period = h->hpet.period[tn];
+        if (period)
+        {
+            elapsed = hpet_read_maincounter(h) + period - 1 - comparator;
+            comparator += (elapsed / period) * period;
+            h->hpet.comparator64[tn] = comparator;
+        }
+    }
+    
+    /* truncate if timer is in 32 bit mode */
+    if ( timer_is_32bit(h, tn) )
+        comparator = (uint32_t)comparator;
+    h->hpet.timers[tn].cmp = comparator;
+    return comparator;
+}
 static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
 {
     addr &= ~7;
@@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETS
     case HPET_STATUS:
         return h->hpet.isr;
     case HPET_COUNTER:
-        return h->hpet.mc64;
+        return hpet_read_maincounter(h);
     case HPET_T0_CFG:
     case HPET_T1_CFG:
     case HPET_T2_CFG:
@@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETS
     case HPET_T0_CMP:
     case HPET_T1_CMP:
     case HPET_T2_CMP:
-        return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp;
+        return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5);
     case HPET_T0_ROUTE:
     case HPET_T1_ROUTE:
     case HPET_T2_ROUTE:
@@ -140,16 +172,6 @@ static inline int hpet_check_access_leng
     return 0;
 }
 
-static inline uint64_t hpet_read_maincounter(HPETState *h)
-{
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( hpet_enabled(h) )
-        return guest_time_hpet(h->vcpu) + h->mc_offset;
-    else 
-        return h->hpet.mc64;
-}
-
 static int hpet_read(
     struct vcpu *v, unsigned long addr, unsigned long length,
     unsigned long *pval)
@@ -169,8 +191,6 @@ static int hpet_read(
     spin_lock(&h->lock);
 
     val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        val = hpet_read_maincounter(h);
 
     result = val;
     if ( length != 8 )
@@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h
 {
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
-    stop_timer(&h->timers[tn]);
+    destroy_periodic_time(&h->pt[tn]);
+    /* Read the comparator to get it updated, so that a read while the
+     * timer is stopped returns the expected value. */
+    hpet_get_comparator(h, tn);
 }
 
 /* the number of HPET tick that stands for
@@ -197,6 +220,8 @@ static void hpet_set_timer(HPETState *h,
 static void hpet_set_timer(HPETState *h, unsigned int tn)
 {
     uint64_t tn_cmp, cur_tick, diff;
+    unsigned int irq;
+    unsigned int oneshot;
 
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
@@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h,
         pit_stop_channel0_irq(pit);
     }
 
-    tn_cmp   = h->hpet.timers[tn].cmp;
+    if ( !timer_enabled(h, tn) )
+        return;
+
+    tn_cmp   = hpet_get_comparator(h, tn);
     cur_tick = hpet_read_maincounter(h);
     if ( timer_is_32bit(h, tn) )
     {
@@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h,
         diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
             ? (uint32_t)diff : 0;
 
-    set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff));
+    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
+        /* If the LegacyReplacementRoute bit is set, the HPET specification
+           requires that timer0 be routed to IRQ0 in non-APIC mode or IRQ2
+           in the I/O APIC, and timer1 to IRQ8 in either mode. */
+        irq = (tn == 0) ? 0 : 8;
+    else
+        irq = timer_int_route(h, tn);
+
+    /*
+     * diff is the time from now until the timer should fire; for a periodic
+     * timer we also need the period, which may differ because time may have
+     * elapsed between the comparator being written and the timer being
+     * enabled (now).
+     */
+    oneshot = !timer_is_periodic(h, tn);
+    create_periodic_time(h->vcpu, &h->pt[tn],
+                         hpet_tick_to_ns(h, diff),
+                         oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]),
+                         irq, NULL, NULL);
 }
 
 static inline uint64_t hpet_fixup_reg(
@@ -248,6 +294,13 @@ static int hpet_write(
     uint64_t old_val, new_val;
     int tn, i;
 
+    /* Accumulate a bit mask of timers whose state is changed by this write. */
+    unsigned long start_timers = 0;
+    unsigned long stop_timers  = 0;
+#define set_stop_timer(n)    (__set_bit((n), &stop_timers))
+#define set_start_timer(n)   (__set_bit((n), &start_timers))
+#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
+
     addr &= HPET_MMAP_SIZE-1;
 
     if ( hpet_check_access_length(addr, length) != 0 )
@@ -256,9 +309,6 @@ static int hpet_write(
     spin_lock(&h->lock);
 
     old_val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        old_val = hpet_read_maincounter(h);
-
     new_val = val;
     if ( length != 8 )
         new_val = hpet_fixup_reg(
@@ -275,22 +325,35 @@ static int hpet_write(
             /* Enable main counter and interrupt generation. */
             h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_set_timer(h, i); 
+            {
+                h->hpet.comparator64[i] =
+                            h->hpet.timers[i].config & HPET_TN_32BIT ?
+                                          (uint32_t)h->hpet.timers[i].cmp :
+                                                    h->hpet.timers[i].cmp;
+                if ( timer_enabled(h, i) )
+                    set_start_timer(i);
+            }
         }
         else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
         {
             /* Halt main counter and disable interrupt generation. */
             h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_stop_timer(h, i);
+                if ( timer_enabled(h, i) )
+                    set_stop_timer(i);
         }
         break;
 
     case HPET_COUNTER:
+        h->hpet.mc64 = new_val;
         if ( hpet_enabled(h) )
+        {
             gdprintk(XENLOG_WARNING, 
                      "HPET: writing main counter but it's not halted!\n");
-        h->hpet.mc64 = new_val;
+            for ( i = 0; i < HPET_TIMER_NUM; i++ )
+                if ( timer_enabled(h, i) )
+                    set_restart_timer(i);
+        }
         break;
 
     case HPET_T0_CFG:
@@ -313,7 +376,28 @@ static int hpet_write(
             h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp;
             h->hpet.period[tn] = (uint32_t)h->hpet.period[tn];
         }
-
+        if ( hpet_enabled(h) )
+        {
+            if ( new_val & HPET_TN_ENABLE )
+            {
+                if ( (new_val ^ old_val) & HPET_TN_PERIODIC )
+                    /* Timer is enabled but switching mode to/from periodic/
+                     * one-shot; stop and restart the vpt timer to get it
+                     * into the right mode. */
+                    set_restart_timer(tn);
+                else if ( (new_val & HPET_TN_32BIT) &&
+                         !(old_val & HPET_TN_32BIT) )
+                    /* Switching from 64 bit to 32 bit mode could cause the
+                     * timer's next fire time, or period, to change. */
+                    set_restart_timer(tn);
+                else if ( !(old_val & HPET_TN_ENABLE) )
+                    /* transition from timer disabled to timer enabled. */
+                    set_start_timer(tn);
+            }
+            else if ( old_val & HPET_TN_ENABLE )
+                /* transition from timer enabled to timer disabled. */
+                set_stop_timer(tn);
+        }
         break;
 
     case HPET_T0_CMP:
@@ -322,24 +406,32 @@ static int hpet_write(
         tn = (addr - HPET_T0_CMP) >> 5;
         if ( timer_is_32bit(h, tn) )
             new_val = (uint32_t)new_val;
-        if ( !timer_is_periodic(h, tn) ||
-             (h->hpet.timers[tn].config & HPET_TN_SETVAL) )
-            h->hpet.timers[tn].cmp = new_val;
-        else
+        h->hpet.timers[tn].cmp = new_val;
+        if ( h->hpet.timers[tn].config & HPET_TN_SETVAL )
+            /*
+             * When SETVAL is one, software is able to "directly set a periodic
+             * timer's accumulator."  That is, set the comparator without
+             * adjusting the period.  Much the same as just setting the
+             * comparator on an enabled one-shot timer.
+             * 
+             * This configuration bit clears when the comparator is written.
+             */
+            h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
+        else if ( timer_is_periodic(h, tn) )
         {
             /*
              * Clamp period to reasonable min/max values:
-             *  - minimum is 900us, same as timers controlled by vpt.c
+             *  - minimum is 100us, same as timers controlled by vpt.c
              *  - maximum is to prevent overflow in time_after() calculations
              */
-            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) )
-                new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale;
+            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) )
+                new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale;
             new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1;
             h->hpet.period[tn] = new_val;
         }
-        h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
-        if ( hpet_enabled(h) )
-            hpet_set_timer(h, tn);
+        h->hpet.comparator64[tn] = new_val;
+        if ( hpet_enabled(h) && timer_enabled(h, tn) )
+            set_restart_timer(tn);
         break;
 
     case HPET_T0_ROUTE:
@@ -354,6 +446,25 @@ static int hpet_write(
         break;
     }
 
+    /* Stop/start timers whose state was changed by this write. */
+    while (stop_timers)
+    {
+        i = find_first_set_bit(stop_timers);
+        __clear_bit(i, &stop_timers);
+        hpet_stop_timer(h, i);
+    }
+
+    while (start_timers)
+    {
+        i = find_first_set_bit(start_timers);
+        __clear_bit(i, &start_timers);
+        hpet_set_timer(h, i);
+    }
+
+#undef set_stop_timer
+#undef set_start_timer
+#undef set_restart_timer
+
     spin_unlock(&h->lock);
 
  out:
@@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handle
     .write_handler = hpet_write
 };
 
-static void hpet_route_interrupt(HPETState *h, unsigned int tn)
-{
-    unsigned int tn_int_route = timer_int_route(h, tn);
-    struct domain *d = h->vcpu->domain;
-
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
-    {
-        /* if LegacyReplacementRoute bit is set, HPET specification requires
-           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
-           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
-        int isa_irq = (tn == 0) ? 0 : 8;
-        hvm_isa_irq_deassert(d, isa_irq);
-        hvm_isa_irq_assert(d, isa_irq);
-        return;
-    }
-
-    if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) )
-    {
-        gdprintk(XENLOG_ERR,
-                 "HPET: timer%u: invalid interrupt route config\n", tn);
-        domain_crash(d);
-        return;
-    }
-
-    /* We support only edge-triggered interrupt. */
-    spin_lock(&d->arch.hvm_domain.irq_lock);
-    vioapic_irq_positive_edge(d, tn_int_route);
-    spin_unlock(&d->arch.hvm_domain.irq_lock);
-}
-
-static void hpet_timer_fn(void *opaque)
-{
-    struct HPET_timer_fn_info *htfi = opaque;
-    HPETState *h = htfi->hs;
-    unsigned int tn = htfi->tn;
-
-    spin_lock(&h->lock);
-
-    if ( !hpet_enabled(h) )
-    {
-        spin_unlock(&h->lock);
-        return;
-    }
-
-    if ( timer_config(h, tn) & HPET_TN_ENABLE )
-        hpet_route_interrupt(h, tn);
-
-    if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) )
-    {
-        uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn];
-        if ( timer_is_32bit(h, tn) )
-        {
-            while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp = (uint32_t)(
-                    h->hpet.timers[tn].cmp + period);
-        }
-        else
-        {
-            while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp += period;
-        }
-        set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period));
-    }
-
-    spin_unlock(&h->lock);
-}
-
-void hpet_migrate_timers(struct vcpu *v)
-{
-    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
-    int i;
-
-    if ( v != h->vcpu )
-        return;
-
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        migrate_timer(&h->timers[i], v->processor);
-}
 
 static int hpet_save(struct domain *d, hvm_domain_context_t *h)
 {
@@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, h
         C(isr);
         C(mc64);
         C(timers[0].config);
-        C(timers[0].cmp);
         C(timers[0].fsb);
         C(timers[1].config);
-        C(timers[1].cmp);
         C(timers[1].fsb);
         C(timers[2].config);
-        C(timers[2].cmp);
         C(timers[2].fsb);
         C(period[0]);
         C(period[1]);
         C(period[2]);
 #undef C
+        /* save the 64 bit comparator in the 64 bit timer[n].cmp field
+         * regardless of whether or not the timer is in 32 bit mode. */
+        rec->timers[0].cmp = hp->hpet.comparator64[0];
+        rec->timers[1].cmp = hp->hpet.comparator64[1];
+        rec->timers[2].cmp = hp->hpet.comparator64[2];
     }
 
     spin_unlock(&hp->lock);
@@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, h
 {
     HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
     struct hvm_hw_hpet *rec;
+    uint64_t cmp;
     int i;
 
     spin_lock(&hp->lock);
@@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, h
     h->cur += HVM_SAVE_LENGTH(HPET);
 
 #define C(x) hp->hpet.x = rec->x
-        C(capability);
-        C(config);
-        C(isr);
-        C(mc64);
-        C(timers[0].config);
-        C(timers[0].cmp);
-        C(timers[0].fsb);
-        C(timers[1].config);
-        C(timers[1].cmp);
-        C(timers[1].fsb);
-        C(timers[2].config);
-        C(timers[2].cmp);
-        C(timers[2].fsb);
-        C(period[0]);
-        C(period[1]);
-        C(period[2]);
+    C(capability);
+    C(config);
+    C(isr);
+    C(mc64);
+    /* The following define will generate a compiler error if HPET_TIMER_NUM
+     * changes. This indicates an incompatibility with previous saved state. */
+#define HPET_TIMER_NUM 3
+    for ( i = 0; i < HPET_TIMER_NUM; i++ )
+    {
+        C(timers[i].config);
+        C(timers[i].fsb);
+        C(period[i]);
+        /* restore the hidden 64 bit comparator and truncate the timer's
+         * visible comparator field if in 32 bit mode. */
+        cmp = rec->timers[i].cmp;
+        hp->hpet.comparator64[i] = cmp;
+        if ( timer_is_32bit(hp, i) )
+            cmp = (uint32_t)cmp;
+        hp->hpet.timers[i].cmp = cmp;
+    }
 #undef C
     
     /* Recalculate the offset between the main counter and guest time */
     hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
-                
-    /* Restart the timers */
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        if ( hpet_enabled(hp) )
-            hpet_set_timer(hp, i);
-
+
+    /* restart all timers */
+
+    if ( hpet_enabled(hp) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(hp, i) )
+                hpet_set_timer(hp, i);
+ 
     spin_unlock(&hp->lock);
 
     return 0;
@@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v)
         h->hpet.timers[i].config = 
             HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP;
         h->hpet.timers[i].cmp = ~0ULL;
-        h->timer_fn_info[i].hs = h;
-        h->timer_fn_info[i].tn = i;
-        init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i],
-                   v->processor);
+        h->pt[i].source = PTSRC_isa;
     }
 }
 
@@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d)
     int i;
     HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
 
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        kill_timer(&h->timers[i]);
+    spin_lock(&h->lock);
+
+    if ( hpet_enabled(h) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(h, i) )
+                hpet_stop_timer(h, i);
+
+    spin_unlock(&h->lock);
 }
 
 void hpet_reset(struct domain *d)
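The periodic catch-up in hpet_get_comparator() above advances the comparator by whole periods so that it lands on the first tick at or after the main counter. A runnable arithmetic check (the values are made up):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t catch_up(uint64_t comparator, uint64_t period, uint64_t mc)
    {
        uint64_t elapsed = mc + period - 1 - comparator;

        return comparator + (elapsed / period) * period;
    }

    int main(void)
    {
        /* Ticks at 100, 130, 160, 190; main counter at 175 -> expect 190. */
        printf("%llu\n", (unsigned long long)catch_up(100, 30, 175));
        return 0;
    }
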
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Tue Nov 04 12:43:19 2008 +0900
@@ -163,7 +163,6 @@ void hvm_migrate_timers(struct vcpu *v)
 void hvm_migrate_timers(struct vcpu *v)
 {
     rtc_migrate_timers(v);
-    hpet_migrate_timers(v);
     pt_migrate(v);
 }
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/i8254.c  Tue Nov 04 12:43:19 2008 +0900
@@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit
     case 2:
     case 3:
         /* Periodic timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
+        create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, 
                              &pit->count_load_time[channel]);
         break;
     case 1:
     case 4:
         /* One-shot timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
+        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired,
                              &pit->count_load_time[channel]);
         break;
     default:
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/rtc.c
--- a/xen/arch/x86/hvm/rtc.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/rtc.c    Tue Nov 04 12:43:19 2008 +0900
@@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s
 
         period = 1 << (period_code - 1); /* period in 32 Khz cycles */
         period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */
-        create_periodic_time(v, &s->pt, period, RTC_IRQ,
-                             0, rtc_periodic_cb, s);
+        create_periodic_time(v, &s->pt, period, period, RTC_IRQ,
+                             rtc_periodic_cb, s);
     }
     else
     {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/svm/entry.S      Tue Nov 04 12:43:19 2008 +0900
@@ -57,6 +57,8 @@
 #endif
 
 ENTRY(svm_asm_do_resume)
+        call svm_intr_assist
+
         get_current(bx)
         CLGI
 
@@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume)
         jnz  .Lsvm_process_softirqs
 
         call svm_asid_handle_vmrun
-        call svm_intr_assist
 
         cmpb $0,addr_of(tb_init_done)
         jnz  .Lsvm_trace
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:43:19 2008 +0900
@@ -701,8 +701,9 @@ static int vlapic_write(struct vcpu *v, 
                             (uint32_t)val * vlapic->hw.timer_divisor;
 
         vlapic_set_reg(vlapic, APIC_TMICT, val);
-        create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
-                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
+        create_periodic_time(current, &vlapic->pt, period, 
+                             vlapic_lvtt_period(vlapic) ? period : 0,
+                             vlapic->pt.irq, vlapic_pt_cb,
                              &vlapic->timer_last_update);
         vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
 
@@ -861,8 +862,9 @@ static void lapic_rearm(struct vlapic *s
     period = ((uint64_t)APIC_BUS_CYCLE_NS *
               (uint32_t)tmict * s->hw.timer_divisor);
     s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK;
-    create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
-                         !vlapic_lvtt_period(s), vlapic_pt_cb,
+    create_periodic_time(vlapic_vcpu(s), &s->pt, period,
+                         vlapic_lvtt_period(s) ? period : 0,
+                         s->pt.irq, vlapic_pt_cb,
                          &s->timer_last_update);
     s->timer_last_update = s->pt.last_plt_gtime;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/entry.S      Tue Nov 04 12:43:19 2008 +0900
@@ -122,6 +122,8 @@ vmx_asm_vmexit_handler:
 
 .globl vmx_asm_do_vmentry
 vmx_asm_do_vmentry:
+        call vmx_intr_assist
+
         get_current(bx)
         cli
 
@@ -130,8 +132,6 @@ vmx_asm_do_vmentry:
         lea  addr_of(irq_stat),r(dx)
         cmpl $0,(r(dx),r(ax),1)
         jnz  .Lvmx_process_softirqs
-
-        call vmx_intr_assist
 
         testb $0xff,VCPU_vmx_emul(r(bx))
         jnz  .Lvmx_goto_realmode
@@ -179,11 +179,13 @@ vmx_asm_do_vmentry:
 
 /*.Lvmx_resume:*/
         VMRESUME
+        sti
         call vm_resume_fail
         ud2
 
 .Lvmx_launch:
         VMLAUNCH
+        sti
         call vm_launch_fail
         ud2
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Nov 04 12:43:19 2008 +0900
@@ -49,6 +49,7 @@
 #include <asm/hvm/vpt.h>
 #include <public/hvm/save.h>
 #include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
 
 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 
@@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu
 {
     vmx_destroy_vmcs(v);
     vpmu_destroy(v);
+    passive_domain_destroy(v);
 }
 
 #ifdef __x86_64__
@@ -1666,6 +1668,8 @@ static int vmx_msr_read_intercept(struct
     default:
         if ( vpmu_do_rdmsr(regs) )
             goto done;
+        if ( passive_domain_do_rdmsr(regs) )
+            goto done;
         switch ( long_mode_do_msr_read(regs) )
         {
             case HNDL_unhandled:
@@ -1860,6 +1864,8 @@ static int vmx_msr_write_intercept(struc
         goto gp_fault;
     default:
         if ( vpmu_do_wrmsr(regs) )
+            return X86EMUL_OKAY;
+        if ( passive_domain_do_wrmsr(regs) )
             return X86EMUL_OKAY;
 
         if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) 
@@ -1964,27 +1970,25 @@ static void ept_handle_violation(unsigne
 {
     unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
     struct domain *d = current->domain;
-    unsigned long gfn = gpa >> PAGE_SHIFT;
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t t;
 
-    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
-    {
-        gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr
-                 " exceeded its width limit.\n", gpa);
-        goto crash;
-    }
-
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) ||
-         unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) )
-    {
-        gdprintk(XENLOG_ERR, "EPT violation: reserved bit or "
-                 "pdptr load violation.\n");
-        goto crash;
-    }
-
     mfn = gfn_to_mfn(d, gfn, &t);
-    if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) )
+
+    /* There are two legitimate reasons for taking an EPT violation. 
+     * One is a guest access to MMIO space. */
+    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
+    {
+        handle_mmio();
+        return;
+    }
+
+    /* The other is log-dirty mode, writing to a read-only page */
+    if ( paging_mode_log_dirty(d)
+         && (gla_validity == EPT_GLA_VALIDITY_MATCH
+             || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
+         && p2m_is_ram(t) && (t != p2m_ram_ro) )
     {
         paging_mark_dirty(d, mfn_x(mfn));
         p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
@@ -1992,16 +1996,39 @@ static void ept_handle_violation(unsigne
         return;
     }
 
-    /* This can only happen in log-dirty mode, writing back A/D bits. */
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) )
-        goto crash;
-
-    ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH);
-    handle_mmio();
-
-    return;
-
- crash:
+    /* Everything else is an error. */
+    gla = __vmread(GUEST_LINEAR_ADDRESS);
+    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
+             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", 
+             qualification, 
+             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+             gpa, mfn_x(mfn), t);
+
+    if ( qualification & EPT_GAW_VIOLATION )
+        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", 
+                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
+
+    switch ( gla_validity )
+    {
+    case EPT_GLA_VALIDITY_PDPTR_LOAD:
+        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); 
+        break;
+    case EPT_GLA_VALIDITY_GPT_WALK:
+        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
+        break;
+    case EPT_GLA_VALIDITY_RSVD:
+        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
+        break;
+    case EPT_GLA_VALIDITY_MATCH:
+        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
+        break;
+    }
+
     domain_crash(d);
 }
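The rewritten error path above prints a decode of the exit qualification instead of crashing silently. The r/w/x rendering can be reproduced in isolation; the bit positions below follow the Intel SDM's EPT violation qualification and are stated here as assumptions rather than read from this patch:

    #include <stdio.h>

    #define EPT_READ_VIOLATION   (1u << 0)  /* access was a read   */
    #define EPT_WRITE_VIOLATION  (1u << 1)  /* access was a write  */
    #define EPT_EXEC_VIOLATION   (1u << 2)  /* access was a fetch  */
    #define EPT_EFFECTIVE_READ   (1u << 3)  /* page was readable   */
    #define EPT_EFFECTIVE_WRITE  (1u << 4)  /* page was writable   */
    #define EPT_EFFECTIVE_EXEC   (1u << 5)  /* page was executable */

    static void decode_qualification(unsigned long q, char out[8])
    {
        snprintf(out, 8, "%c%c%c/%c%c%c",
                 (q & EPT_READ_VIOLATION)  ? 'r' : '-',
                 (q & EPT_WRITE_VIOLATION) ? 'w' : '-',
                 (q & EPT_EXEC_VIOLATION)  ? 'x' : '-',
                 (q & EPT_EFFECTIVE_READ)  ? 'r' : '-',
                 (q & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
                 (q & EPT_EFFECTIVE_EXEC)  ? 'x' : '-');
    }
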
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vpmu_core2.c
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:43:19 2008 +0900
@@ -35,6 +35,26 @@
 #include <asm/hvm/vmx/vpmu.h>
 #include <asm/hvm/vmx/vpmu_core2.h>
 
+u32 core2_counters_msr[] =   {
+    MSR_CORE_PERF_FIXED_CTR0,
+    MSR_CORE_PERF_FIXED_CTR1,
+    MSR_CORE_PERF_FIXED_CTR2};
+
+/* Core 2 Non-architectual Performance Control MSRs. */
+u32 core2_ctrls_msr[] = {
+    MSR_CORE_PERF_FIXED_CTR_CTRL,
+    MSR_IA32_PEBS_ENABLE,
+    MSR_IA32_DS_AREA};
+
+struct pmumsr core2_counters = {
+    3,
+    core2_counters_msr
+};
+
+struct pmumsr core2_ctrls = {
+    3,
+    core2_ctrls_msr
+};
 static int arch_pmc_cnt;
 
 static int core2_get_pmc_count(void)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vpt.c    Tue Nov 04 12:43:19 2008 +0900
@@ -355,8 +355,8 @@ void pt_migrate(struct vcpu *v)
 }
 
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data)
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data)
 {
     ASSERT(pt->source != 0);
 
@@ -368,13 +368,13 @@ void create_periodic_time(
     pt->do_not_freeze = 0;
     pt->irq_issued = 0;
 
-    /* Periodic timer must be at least 0.9ms. */
-    if ( (period < 900000) && !one_shot )
+    /* Periodic timer must be at least 0.1ms. */
+    if ( (period < 100000) && period )
     {
         if ( !test_and_set_bool(pt->warned_timeout_too_short) )
             gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too "
                      "small period %"PRIu64"\n", period);
-        period = 900000;
+        period = 100000;
     }
 
     pt->period = period;
@@ -382,15 +382,15 @@ void create_periodic_time(
     pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
     pt->irq = irq;
     pt->period_cycles = (u64)period;
-    pt->one_shot = one_shot;
-    pt->scheduled = NOW() + period;
+    pt->one_shot = !period;
+    pt->scheduled = NOW() + delta;
     /*
      * Offset LAPIC ticks from other timer ticks. Otherwise guests which use
      * LAPIC ticks for process accounting can see long sequences of process
      * ticks incorrectly accounted to interrupt processing.
      */
-    if ( pt->source == PTSRC_lapic )
-        pt->scheduled += period >> 1;
+    if ( !pt->one_shot && (pt->source == PTSRC_lapic) )
+        pt->scheduled += delta >> 1;
     pt->cb = cb;
     pt->priv = data;
 
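With the new signature, create_periodic_time() takes the initial delta and the period as separate arguments, and a period of zero now means one-shot; the old explicit one_shot flag is gone, which is why the i8254, RTC and vLAPIC callers in this changeset pass either 0 or the period in the fourth slot. Hypothetical shims (not in the patch) that spell the convention out:

    /* One-shot: fire once, delta_ns from now. */
    static inline void start_oneshot(struct vcpu *v, struct periodic_time *pt,
                                     uint64_t delta_ns, uint8_t irq,
                                     time_cb *cb, void *priv)
    {
        create_periodic_time(v, pt, delta_ns, 0, irq, cb, priv);
    }

    /* Periodic: first fire after delta_ns, then every period_ns. */
    static inline void start_periodic(struct vcpu *v, struct periodic_time *pt,
                                      uint64_t delta_ns, uint64_t period_ns,
                                      uint8_t irq, time_cb *cb, void *priv)
    {
        create_periodic_time(v, pt, delta_ns, period_ns, irq, cb, priv);
    }
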
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/irq.c        Tue Nov 04 12:43:19 2008 +0900
@@ -793,6 +793,10 @@ int map_domain_pirq(
 
     ASSERT(spin_is_locked(&d->event_lock));
 
+    /* XXX Until pcidev and msi locking is fixed. */
+    if ( type == MAP_PIRQ_TYPE_MSI )
+        return -EINVAL;
+
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
 
@@ -840,7 +844,7 @@ int map_domain_pirq(
     d->arch.pirq_vector[pirq] = vector;
     d->arch.vector_pirq[vector] = pirq;
 
-done:
+ done:
     spin_unlock_irqrestore(&desc->lock, flags);
     return ret;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm.c Tue Nov 04 12:43:19 2008 +0900
@@ -566,19 +566,21 @@ static int get_page_and_type_from_pagenr
 static int get_page_and_type_from_pagenr(unsigned long page_nr, 
                                          unsigned long type,
                                          struct domain *d,
+                                         int partial,
                                          int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
     int rc;
 
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_pagenr(page_nr, d)) )
         return -EINVAL;
 
     rc = (preemptible ?
           get_page_type_preemptible(page, type) :
           (get_page_type(page, type) ? 0 : -EINVAL));
 
-    if ( rc )
+    if ( unlikely(rc) && partial >= 0 )
         put_page(page);
 
     return rc;
@@ -761,7 +763,7 @@ get_page_from_l2e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
     if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
         rc = 0;
 
@@ -772,7 +774,7 @@ define_get_linear_pagetable(l3);
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -786,7 +788,7 @@ get_page_from_l3e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
         rc = 0;
 
@@ -797,7 +799,7 @@ define_get_linear_pagetable(l4);
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -811,7 +813,7 @@ get_page_from_l4e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
         rc = 0;
 
@@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
     return 1;
 }
 
+static int __put_page_type(struct page_info *, int preemptible);
 
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
          (l3e_get_pfn(l3e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l3e_get_page(l3e), preemptible);
         return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+    }
     return 1;
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l4e_get_page(l4e), preemptible);
         return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    }
     return 1;
 }
 #endif
@@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
 #if CONFIG_PAGING_LEVELS == 3
     /*
@@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
@@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
                 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
                                                    PGT_l2_page_table |
                                                    PGT_pae_xen_l2,
-                                                   d, preemptible);
+                                                   d, partial, preemptible);
         }
         else if ( !is_guest_l3_slot(i) ||
-                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
+                                          partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
             if ( !is_guest_l3_slot(i) )
                 continue;
             unadjust_guest_l3e(pl3e[i], d);
-            put_page_from_l3e(pl3e[i], pfn, 0);
+            put_page_from_l3e(pl3e[i], pfn, 0, 0);
         }
     }
 
@@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
     unsigned int   i;
-    int            rc = 0;
-
-    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+    int            rc = 0, partial = page->partial_pte;
+
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
+             (rc = get_page_from_l4e(pl4e[i], pfn, d,
+                                     partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR )
         {
@@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
             MEM_LOG("Failure in alloc_l4_table: entry %d", i);
             while ( i-- > 0 )
                 if ( is_guest_l4_slot(d, i) )
-                    put_page_from_l4e(pl4e[i], pfn, 0);
+                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
         }
         if ( rc < 0 )
             return rc;
@@ -1377,24 +1392,20 @@ static int free_l3_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l3 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     pl3e = map_domain_page(pfn);
 
     do {
         if ( is_guest_l3_slot(i) )
         {
-            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+            if ( rc < 0 )
+                break;
+            partial = 0;
             if ( rc > 0 )
                 continue;
-            if ( rc )
-                break;
             unadjust_guest_l3e(pl3e[i], d);
         }
     } while ( i-- );
@@ -1404,7 +1415,7 @@ static int free_l3_table(struct page_inf
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -1421,23 +1432,21 @@ static int free_l4_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l4 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     do {
         if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
-    } while ( rc >= 0 && i-- );
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
 
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
@@ -1713,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
             return page_unlock(l3pg), rc;
         rc = 0;
@@ -1742,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
     }
 
     page_unlock(l3pg);
-    put_page_from_l3e(ol3e, pfn, 0);
+    put_page_from_l3e(ol3e, pfn, 0, 0);
     return rc;
 }
 
@@ -1791,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+        rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
             return page_unlock(l4pg), rc;
         rc = 0;
@@ -1812,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
     }
 
     page_unlock(l4pg);
-    put_page_from_l4e(ol4e, pfn, 0);
+    put_page_from_l4e(ol4e, pfn, 0, 0);
     return rc;
 }
 
@@ -1847,7 +1856,8 @@ int get_page(struct page_info *page, str
         nx = x + 1;
         d  = nd;
         if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
-             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
+             /* Keep one spare reference to be acquired by get_page_light(). */
+             unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
             if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
@@ -1867,6 +1877,28 @@ int get_page(struct page_info *page, str
     while ( unlikely(nd != d) || unlikely(y != x) );
 
     return 1;
+}
+
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ *   acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+    u32 x, nx, y = page->count_info;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+        y = cmpxchg(&page->count_info, x, nx);
+    }
+    while ( unlikely(y != x) );
 }
 
 
@@ -1909,6 +1941,7 @@ static int alloc_page_type(struct page_i
     wmb();
     if ( rc == -EAGAIN )
     {
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
     else if ( rc == -EINTR )
@@ -1973,6 +2006,7 @@ int free_page_type(struct page_info *pag
         page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
         page->partial_pte = 0;
     }
+
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
@@ -1998,6 +2032,15 @@ int free_page_type(struct page_info *pag
         BUG();
     }
 
+    return rc;
+}
+
+
+static int __put_final_page_type(
+    struct page_info *page, unsigned long type, int preemptible)
+{
+    int rc = free_page_type(page, type, preemptible);
+
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
@@ -2016,8 +2059,8 @@ int free_page_type(struct page_info *pag
     }
     else if ( rc == -EINTR )
     {
-        ASSERT(!(page->u.inuse.type_info &
-                 (PGT_count_mask|PGT_validated|PGT_partial)));
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
         if ( !(shadow_mode_enabled(page_get_owner(page)) &&
                (page->count_info & PGC_page_table)) )
             page->tlbflush_timestamp = tlbflush_current_time();
@@ -2028,6 +2071,7 @@ int free_page_type(struct page_info *pag
     {
         BUG_ON(rc != -EAGAIN);
         wmb();
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
 
@@ -2039,6 +2083,7 @@ static int __put_page_type(struct page_i
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     for ( ; ; )
     {
@@ -2062,7 +2107,10 @@ static int __put_page_type(struct page_i
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                return free_page_type(page, x, preemptible);
+                rc = __put_final_page_type(page, x, preemptible);
+                if ( x & PGT_partial )
+                    put_page(page);
+                break;
             }
 
             /*
@@ -2084,7 +2132,7 @@ static int __put_page_type(struct page_i
             return -EINTR;
     }
 
-    return 0;
+    return rc;
 }
 
 
@@ -2092,6 +2140,7 @@ static int __get_page_type(struct page_i
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 
@@ -2214,10 +2263,13 @@ static int __get_page_type(struct page_i
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
-        return alloc_page_type(page, type, preemptible);
-    }
-
-    return 0;
+        rc = alloc_page_type(page, type, preemptible);
+    }
+
+    if ( (x & PGT_partial) && !(nx & PGT_partial) )
+        put_page(page);
+
+    return rc;
 }
 
 void put_page_type(struct page_info *page)
@@ -2296,7 +2348,7 @@ int new_guest_cr3(unsigned long mfn)
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2431,6 +2483,29 @@ static inline cpumask_t vcpumask_to_pcpu
     return pmask;
 }
 
+#ifdef __i386__
+static inline void *fixmap_domain_page(unsigned long mfn)
+{
+    unsigned int cpu = smp_processor_id();
+    void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
+
+    l1e_write(fix_pae_highmem_pl1e - cpu,
+              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+    flush_tlb_one_local(ptr);
+    return ptr;
+}
+static inline void fixunmap_domain_page(const void *ptr)
+{
+    unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
+
+    l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+}
+#else
+#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
+#define fixunmap_domain_page(ptr) ((void)(ptr))
+#endif
+
 int do_mmuext_op(
     XEN_GUEST_HANDLE(mmuext_op_t) uops,
     unsigned int count,
@@ -2517,7 +2592,7 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
             okay = !rc;
             if ( unlikely(!okay) )
             {
@@ -2598,7 +2673,7 @@ int do_mmuext_op(
                     okay = get_page_from_pagenr(mfn, d);
                 else
                     okay = !get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d, 0);
+                        mfn, PGT_root_page_table, d, 0, 0);
                 if ( unlikely(!okay) )
                 {
                     MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2697,6 +2772,66 @@ int do_mmuext_op(
                 if ( ents != 0 )
                     this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
             }
+            break;
+        }
+
+        case MMUEXT_CLEAR_PAGE:
+        {
+            unsigned char *ptr;
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while clearing mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being cleared. */
+            paging_mark_dirty(d, mfn);
+
+            ptr = fixmap_domain_page(mfn);
+            clear_page(ptr);
+            fixunmap_domain_page(ptr);
+
+            put_page_and_type(page);
+            break;
+        }
+
+        case MMUEXT_COPY_PAGE:
+        {
+            const unsigned char *src;
+            unsigned char *dst;
+            unsigned long src_mfn;
+
+            src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
+            okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while copying from mfn %lx", src_mfn);
+                break;
+            }
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                put_page(mfn_to_page(src_mfn));
+                MEM_LOG("Error while copying to mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being copied to. */
+            paging_mark_dirty(d, mfn);
+
+            src = map_domain_page(src_mfn);
+            dst = fixmap_domain_page(mfn);
+            copy_page(dst, src);
+            fixunmap_domain_page(dst);
+            unmap_domain_page(src);
+
+            put_page_and_type(page);
+            put_page(mfn_to_page(src_mfn));
             break;
         }
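
The mm.c hunks above hinge on one idea: get_page() now refuses the last two representable reference counts, so the new get_page_light() (used when a preempted validation sets PGT_partial) can always take one more reference without an owner check and without failing. A minimal, self-contained sketch of that spare-reference pattern, with invented names (counter_t, acquire, acquire_light) rather than the hypervisor's types:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_MASK 0xffu   /* pretend the refcount lives in the low 8 bits */

typedef struct { uint32_t count; } counter_t;

/* Normal acquire: refuses the last two counts, mirroring the
 * ((nx + 1) & PGC_count_mask) <= 1 overflow test in get_page(). */
static int acquire(counter_t *c)
{
    uint32_t x, nx, y = c->count;
    do {
        x  = y;
        nx = x + 1;
        if ( ((nx + 1) & COUNT_MASK) <= 1 )   /* keep one spare reference */
            return 0;
        y = __sync_val_compare_and_swap(&c->count, x, nx);
    } while ( y != x );
    return 1;
}

/* Light acquire: no owner check; cannot fail thanks to the spare count. */
static void acquire_light(counter_t *c)
{
    uint32_t x, nx, y = c->count;
    do {
        x  = y;
        nx = x + 1;
        assert(x & COUNT_MASK);    /* caller guarantees a live reference */
        assert(nx & COUNT_MASK);   /* cannot overflow: acquire() reserved it */
        y = __sync_val_compare_and_swap(&c->count, x, nx);
    } while ( y != x );
}

int main(void)
{
    counter_t c = { .count = 1 };
    int taken = 0;
    while ( acquire(&c) )
        taken++;
    acquire_light(&c);   /* exactly one reference was held in reserve */
    printf("regular refs: %d, final count: %u\n", taken, c.count & COUNT_MASK);
    return 0;
}

Built standalone (e.g. with gcc), the demo saturates the regular path and shows the light path still succeeding exactly once, which is why the patch pairs each get_page_light() with a put_page() when PGT_partial is cleared.
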
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm/hap/p2m-ept.c     Tue Nov 04 12:43:19 2008 +0900
@@ -157,9 +157,6 @@ ept_set_entry(struct domain *d, unsigned
     {
         if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
         {
-            /* Track the highest gfn for which we have ever had a valid mapping */
-            if ( gfn > d->arch.p2m->max_mapped_pfn )
-                d->arch.p2m->max_mapped_pfn = gfn;
             ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
             ept_entry->sp_avail = walk_level ? 1 : 0;
 
@@ -233,6 +230,11 @@ ept_set_entry(struct domain *d, unsigned
 
         unmap_domain_page(split_table);
     }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn))
+         && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) )
+        d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
 
     /* Success */
     rv = 1;
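
The relocated max_mapped_pfn update above also fixes an off-by-(2^order - 1): with a superpage mapping, the highest gfn covered is gfn + (1UL << order) - 1, not gfn itself. A small sketch of that bookkeeping (illustrative names, not the p2m code):

#include <stdio.h>

static unsigned long max_mapped_pfn;

static void track_mapping(unsigned long gfn, unsigned int order)
{
    unsigned long last = gfn + (1UL << order) - 1;   /* last covered gfn */
    if ( last > max_mapped_pfn )
        max_mapped_pfn = last;
}

int main(void)
{
    track_mapping(0x100, 0);  /* single 4k page: covers 0x100 only  */
    track_mapping(0x200, 9);  /* 2M superpage:   covers 0x200-0x3ff */
    printf("max_mapped_pfn = %#lx\n", max_mapped_pfn);  /* 0x3ff */
    return 0;
}
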
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm/p2m.c     Tue Nov 04 12:43:19 2008 +0900
@@ -322,7 +322,8 @@ p2m_set_entry(struct domain *d, unsigned
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+    if ( mfn_valid(mfn) 
+         && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
 
     if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )
@@ -956,18 +957,18 @@ guest_physmap_add_entry(struct domain *d
     /* First, remove m->p mappings for existing p->m mappings */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        omfn = gfn_to_mfn(d, gfn, &ot);
+        omfn = gfn_to_mfn(d, gfn + i, &ot);
         if ( p2m_is_ram(ot) )
         {
             ASSERT(mfn_valid(omfn));
-            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
+            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
         }
     }
 
     /* Then, look for m->p mappings for this range and deal with them */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        ogfn = mfn_to_gfn(d, _mfn(mfn));
+        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
         if (
 #ifdef __x86_64__
             (ogfn != 0x5555555555555555L)
@@ -975,20 +976,20 @@ guest_physmap_add_entry(struct domain *d
             (ogfn != 0x55555555L)
 #endif
             && (ogfn != INVALID_M2P_ENTRY)
-            && (ogfn != gfn) )
+            && (ogfn != gfn + i) )
         {
             /* This machine frame is already mapped at another physical
              * address */
             P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
-                      mfn, ogfn, gfn);
+                      mfn + i, ogfn, gfn + i);
             omfn = gfn_to_mfn(d, ogfn, &ot);
             if ( p2m_is_ram(ot) )
             {
                 ASSERT(mfn_valid(omfn));
                 P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                           ogfn , mfn_x(omfn));
-                if ( mfn_x(omfn) == mfn )
-                    p2m_remove_page(d, ogfn, mfn, 0);
+                if ( mfn_x(omfn) == (mfn + i) )
+                    p2m_remove_page(d, ogfn, mfn + i, 0);
             }
         }
     }
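
The guest_physmap_add_entry() fixes above are all of one shape: inside the loop over the 2^page_order frames being mapped, the per-iteration offsets gfn + i and mfn + i must be used, where the old code kept re-examining frame 0. A toy model of the corrected loop, assuming flat p2m/m2p arrays purely for illustration:

#include <stdio.h>

#define FRAMES 16
static long p2m[FRAMES]; /* gfn -> mfn, -1 = not mapped */
static long m2p[FRAMES]; /* mfn -> gfn, -1 = not mapped */

/* Map 2^order frames starting at (gfn, mfn), clearing stale reverse
 * mappings per page -- the pattern the patch restores with gfn+i / mfn+i. */
static void map_range(unsigned long gfn, unsigned long mfn, unsigned int order)
{
    unsigned long i;
    for ( i = 0; i < (1UL << order); i++ )
    {
        long omfn = p2m[gfn + i];
        if ( omfn >= 0 )
            m2p[omfn] = -1;          /* drop stale m->p for this slot */
        p2m[gfn + i] = mfn + i;
        m2p[mfn + i] = gfn + i;
    }
}

int main(void)
{
    for ( int i = 0; i < FRAMES; i++ )
        p2m[i] = m2p[i] = -1;
    map_range(4, 8, 2);              /* gfns 4-7 -> mfns 8-11 */
    for ( int g = 4; g < 8; g++ )
        printf("gfn %d -> mfn %ld\n", g, p2m[g]);
    return 0;
}
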
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/msi.c
--- a/xen/arch/x86/msi.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/msi.c        Tue Nov 04 12:43:19 2008 +0900
@@ -33,8 +33,7 @@ DECLARE_BITMAP(msix_fixmap_pages, MAX_MS
 
 static int msix_fixmap_alloc(void)
 {
-    int i;
-    int rc = -1;
+    int i, rc = -1;
 
     spin_lock(&msix_fixmap_lock);
     for ( i = 0; i < MAX_MSIX_PAGES; i++ )
@@ -52,12 +51,8 @@ static int msix_fixmap_alloc(void)
 
 static void msix_fixmap_free(int idx)
 {
-    if ( idx < FIX_MSIX_IO_RESERV_BASE )
-        return;
-
-    spin_lock(&msix_fixmap_lock);
-    clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
-    spin_unlock(&msix_fixmap_lock);
+    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
+        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
 }
 
 /*
@@ -78,19 +73,19 @@ static void msi_compose_msg(struct pci_d
         msg->address_lo =
             MSI_ADDR_BASE_LO |
             ((INT_DEST_MODE == 0) ?
-                MSI_ADDR_DESTMODE_PHYS:
-                MSI_ADDR_DESTMODE_LOGIC) |
+             MSI_ADDR_DESTMODE_PHYS:
+             MSI_ADDR_DESTMODE_LOGIC) |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_ADDR_REDIRECTION_CPU:
-                MSI_ADDR_REDIRECTION_LOWPRI) |
+             MSI_ADDR_REDIRECTION_CPU:
+             MSI_ADDR_REDIRECTION_LOWPRI) |
             MSI_ADDR_DEST_ID(dest);
 
         msg->data =
             MSI_DATA_TRIGGER_EDGE |
             MSI_DATA_LEVEL_ASSERT |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_DATA_DELIVERY_FIXED:
-                MSI_DATA_DELIVERY_LOWPRI) |
+             MSI_DATA_DELIVERY_FIXED:
+             MSI_DATA_DELIVERY_LOWPRI) |
             MSI_DATA_VECTOR(vector);
     }
 }
@@ -128,7 +123,7 @@ static void read_msi_msg(struct msi_desc
     {
         void __iomem *base;
         base = entry->mask_base +
-           entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 
         msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -205,9 +200,9 @@ static void write_msi_msg(struct msi_des
             entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 
         writel(msg->address_lo,
-            base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         writel(msg->address_hi,
-            base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
         writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
         break;
     }
@@ -230,7 +225,7 @@ void set_msi_irq_affinity(unsigned int i
     dest = cpu_mask_to_apicid(mask);
 
     if ( !desc )
-       return;
+        return;
 
     ASSERT(spin_is_locked(&irq_desc[irq].lock));
     spin_lock(&desc->dev->lock);
@@ -398,8 +393,8 @@ static void msi_free_vector(int vector)
         unsigned long start;
 
         writel(1, entry->mask_base + entry->msi_attrib.entry_nr
-              * PCI_MSIX_ENTRY_SIZE
-              + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+               * PCI_MSIX_ENTRY_SIZE
+               + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
         start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
         msix_fixmap_free(virt_to_fix(start));
@@ -460,20 +455,20 @@ static int msi_capability_init(struct pc
     entry->vector = vector;
     if ( is_mask_bit_support(control) )
         entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
-                is_64bit_address(control));
+                                                                   is_64bit_address(control));
     entry->dev = dev;
     if ( entry->msi_attrib.maskbit )
     {
         unsigned int maskbits, temp;
         /* All MSIs are unmasked by default, Mask them all */
         maskbits = pci_conf_read32(bus, slot, func,
-                       msi_mask_bits_reg(pos, is_64bit_address(control)));
+                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
         temp = (1 << multi_msi_capable(control));
         temp = ((temp - 1) & ~temp);
         maskbits |= temp;
         pci_conf_write32(bus, slot, func,
-            msi_mask_bits_reg(pos, is_64bit_address(control)),
-            maskbits);
+                         msi_mask_bits_reg(pos, is_64bit_address(control)),
+                         maskbits);
     }
     list_add_tail(&entry->list, &dev->msi_list);
 
@@ -575,14 +570,14 @@ static int __pci_enable_msi(struct msi_i
 
     pdev = pci_lock_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on "
-            "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
-            PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         return 0;
     }
 
@@ -601,7 +596,7 @@ static void __pci_disable_msi(int vector
 
     entry = irq_desc[vector].msi_desc;
     if ( !entry )
-       return;
+        return;
     /*
      * Lock here is safe.  msi_desc can not be removed without holding
      * both irq_desc[].lock (which we do) and pdev->lock.
@@ -649,20 +644,20 @@ static int __pci_enable_msix(struct msi_
 
     pdev = pci_lock_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
     nr_entries = multi_msix_capable(control);
     if (msi->entry_nr > nr_entries)
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         return -EINVAL;
     }
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on "
                 "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
                 PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
@@ -684,7 +679,7 @@ static void __pci_disable_msix(int vecto
 
     entry = irq_desc[vector].msi_desc;
     if ( !entry )
-       return;
+        return;
     /*
      * Lock here is safe.  msi_desc can not be removed without holding
      * both irq_desc[].lock (which we do) and pdev->lock.
@@ -712,7 +707,7 @@ int pci_enable_msi(struct msi_info *msi)
     ASSERT(spin_is_locked(&irq_desc[msi->vector].lock));
 
     return  msi->table_base ? __pci_enable_msix(msi) :
-                              __pci_enable_msi(msi);
+        __pci_enable_msi(msi);
 }
 
 void pci_disable_msi(int vector)
@@ -720,7 +715,7 @@ void pci_disable_msi(int vector)
     irq_desc_t *desc = &irq_desc[vector];
     ASSERT(spin_is_locked(&desc->lock));
     if ( !desc->msi_desc )
-       return;
+        return;
 
     if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
         __pci_disable_msi(vector);
@@ -734,7 +729,7 @@ static void msi_free_vectors(struct pci_
     irq_desc_t *desc;
     unsigned long flags;
 
-retry:
+ retry:
     list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
     {
         desc = &irq_desc[entry->vector];
@@ -742,7 +737,7 @@ retry:
         local_irq_save(flags);
         if ( !spin_trylock(&desc->lock) )
         {
-             local_irq_restore(flags);
+            local_irq_restore(flags);
             goto retry;
         }
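
Most of the msi.c churn above is whitespace, but the retained comments state a locking rule worth spelling out: an msi_desc may only be torn down while holding both irq_desc[].lock and pdev->lock, so holding either one alone keeps the pointer stable. A hedged sketch of that two-lock pattern using pthread mutexes in place of Xen's spinlocks (all names invented):

#include <pthread.h>
#include <stdio.h>

struct entry { int vector; };

static pthread_mutex_t irq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry the_entry = { .vector = 42 };
static struct entry *msi_desc = &the_entry;

/* Teardown must take BOTH locks... */
static void remove_entry(void)
{
    pthread_mutex_lock(&irq_lock);
    pthread_mutex_lock(&dev_lock);
    msi_desc = NULL;
    pthread_mutex_unlock(&dev_lock);
    pthread_mutex_unlock(&irq_lock);
}

/* ...so a reader holding just one of them sees a stable pointer. */
static void reader(void)
{
    pthread_mutex_lock(&irq_lock);
    if ( msi_desc )
        printf("vector %d still valid under irq_lock\n", msi_desc->vector);
    else
        printf("entry already gone\n");
    pthread_mutex_unlock(&irq_lock);
}

int main(void)
{
    reader();
    remove_entry();
    reader();
    return 0;
}
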
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/nmi_int.c
--- a/xen/arch/x86/oprofile/nmi_int.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/nmi_int.c   Tue Nov 04 12:43:19 2008 +0900
@@ -36,6 +36,55 @@ static char *cpu_type;
 static char *cpu_type;
 
 extern int is_active(struct domain *d);
+extern int is_passive(struct domain *d);
+
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+       struct vpmu_struct *vpmu = vcpu_vpmu(current);
+
+       if ( model->is_arch_pmu_msr == NULL )
+               return 0;
+       if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
+               return 0;
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               if ( ! model->allocated_msr(current) )
+                       return 0;
+
+       model->load_msr(current, type, index, &msr_content);
+       regs->eax = msr_content & 0xFFFFFFFF;
+       regs->edx = msr_content >> 32;
+       return 1;
+}
+
+
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+       struct vpmu_struct *vpmu = vcpu_vpmu(current);
+
+       if ( model->is_arch_pmu_msr == NULL )
+               return 0;
+       if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
+               return 0;
+
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               if ( ! model->allocated_msr(current) )
+                       return 0;
+
+       msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
+       model->save_msr(current, type, index, msr_content);
+       return 1;
+}
+
+void passive_domain_destroy(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED )
+               model->free_msr(v);
+}
 
 static int nmi_callback(struct cpu_user_regs *regs, int cpu)
 {
@@ -46,6 +95,8 @@ static int nmi_callback(struct cpu_user_
        if ( ovf && is_active(current->domain) && !xen_mode )
                send_guest_vcpu_virq(current, VIRQ_XENOPROF);
 
+       if ( ovf == 2 ) 
+                test_and_set_bool(current->nmi_pending);
        return 1;
 }
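
passive_domain_do_rdmsr()/passive_domain_do_wrmsr() above follow the usual x86 convention of moving 64-bit MSR contents through the 32-bit eax/edx pair. A self-contained round-trip of that split (plain user-space C, not hypervisor code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t msr = 0x1122334455667788ULL;
    uint32_t eax = (uint32_t)(msr & 0xFFFFFFFF);  /* low half  */
    uint32_t edx = (uint32_t)(msr >> 32);         /* high half */
    uint64_t back = (uint64_t)eax | ((uint64_t)edx << 32);
    printf("eax=%#x edx=%#x roundtrip=%#llx\n",
           eax, edx, (unsigned long long)back);
    return 0;
}
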
  
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_model_ppro.c
--- a/xen/arch/x86/oprofile/op_model_ppro.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/op_model_ppro.c     Tue Nov 04 12:43:19 2008 +0900
@@ -18,6 +18,8 @@
 #include <xen/sched.h>
 #include <asm/regs.h>
 #include <asm/current.h>
+#include <asm/hvm/vmx/vpmu.h>
+#include <asm/hvm/vmx/vpmu_core2.h>
  
 #include "op_x86_model.h"
 #include "op_counter.h"
@@ -39,9 +41,11 @@
 #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
 #define CTRL_SET_UM(val, m) (val |= (m << 8))
 #define CTRL_SET_EVENT(val, e) (val |= e)
-
+#define IS_ACTIVE(val) (val & (1 << 22) )  
+#define IS_ENABLE(val) (val & (1 << 20) )
 static unsigned long reset_value[NUM_COUNTERS];
 int ppro_has_global_ctrl = 0;
+extern int is_passive(struct domain *d);
  
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int 
        int ovf = 0;
        unsigned long eip = regs->eip;
        int mode = xenoprofile_get_mode(current, regs);
+       struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context;
 
        for (i = 0 ; i < NUM_COUNTERS; ++i) {
                if (!reset_value[i])
@@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int 
                if (CTR_OVERFLOWED(low)) {
                        xenoprof_log_event(current, regs, eip, mode, i);
                        CTR_WRITE(reset_value[i], msrs, i);
-                       ovf = 1;
+                       if ( is_passive(current->domain) && (mode != 2) && 
+                               (vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) )
+                       {
+                               if ( IS_ACTIVE(msrs_content[i].control) )
+                               {
+                                       msrs_content[i].counter = (low | (u64)high << 32);
+                                       if ( IS_ENABLE(msrs_content[i].control) )
+                                               ovf = 2;
+                               }
+                       }
+                       if ( !ovf )
+                               ovf = 1;
                }
        }
 
@@ -159,6 +175,82 @@ static void ppro_stop(struct op_msrs con
         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 }
 
+static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index)
+{
+       if ( (msr_index >= MSR_IA32_PERFCTR0) &&
+            (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) )
+       {
+               *type = MSR_TYPE_ARCH_COUNTER;
+               *index = msr_index - MSR_IA32_PERFCTR0;
+               return 1;
+        }
+        if ( (msr_index >= MSR_P6_EVNTSEL0) &&
+            (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) )
+        {
+               *type = MSR_TYPE_ARCH_CTRL;
+               *index = msr_index - MSR_P6_EVNTSEL0;
+               return 1;
+        }
+
+        return 0;
+}
+
+static int ppro_allocate_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       struct arch_msr_pair *msr_content;
+       
+       msr_content = xmalloc_bytes( sizeof(struct arch_msr_pair) * NUM_COUNTERS );
+       if ( !msr_content )
+               goto out;
+       memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS);
+       vpmu->context = (void *)msr_content;
+       vpmu->flags = 0;
+       vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED;
+       return 1;
+out:
+        gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile, oprofile is "
+                 "unavailable on domain %d vcpu %d.\n",
+                 v->vcpu_id, v->domain->domain_id);
+        return 0;      
+}
+
+static void ppro_free_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+       xfree(vpmu->context);
+       vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED;
+}
+
+static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               *msr_content = msrs[index].counter;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               *msr_content = msrs[index].control;
+               break;
+       }       
+}
+
+static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               msrs[index].counter = msr_content;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               msrs[index].control = msr_content;
+               break;
+       }       
+}
 
 struct op_x86_model_spec const op_ppro_spec = {
        .num_counters = NUM_COUNTERS,
@@ -167,5 +259,10 @@ struct op_x86_model_spec const op_ppro_s
        .setup_ctrs = &ppro_setup_ctrs,
        .check_ctrs = &ppro_check_ctrs,
        .start = &ppro_start,
-       .stop = &ppro_stop
+       .stop = &ppro_stop,
+       .is_arch_pmu_msr = &ppro_is_arch_pmu_msr,
+       .allocated_msr = &ppro_allocate_msr,
+       .free_msr = &ppro_free_msr,
+       .load_msr = &ppro_load_msr,
+       .save_msr = &ppro_save_msr
 };
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_x86_model.h
--- a/xen/arch/x86/oprofile/op_x86_model.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/op_x86_model.h      Tue Nov 04 12:43:19 2008 +0900
@@ -41,6 +41,11 @@ struct op_x86_model_spec {
                          struct cpu_user_regs * const regs);
        void (*start)(struct op_msrs const * const msrs);
        void (*stop)(struct op_msrs const * const msrs);
+       int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index);
+       int (*allocated_msr)(struct vcpu *v);
+       void (*free_msr)(struct vcpu *v);
+       void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content);
+        void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content);
 };
 
 extern struct op_x86_model_spec const op_ppro_spec;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/setup.c      Tue Nov 04 12:43:19 2008 +0900
@@ -969,6 +969,7 @@ void __init __start_xen(unsigned long mb
     serial_init_postirq();
 
     BUG_ON(!local_irq_is_enabled());
+    spin_debug_enable();
 
     for_each_present_cpu ( i )
     {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/smpboot.c    Tue Nov 04 12:43:19 2008 +0900
@@ -101,7 +101,7 @@ static int __devinitdata tsc_sync_disabl
 static int __devinitdata tsc_sync_disabled;
 
 /* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+struct cpuinfo_x86 cpu_data[NR_CPUS];
 EXPORT_SYMBOL(cpu_data);
 
 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
@@ -112,7 +112,7 @@ static void map_cpu_to_logical_apicid(vo
 /* State of each CPU. */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-static void *stack_base[NR_CPUS] __cacheline_aligned;
+static void *stack_base[NR_CPUS];
 static DEFINE_SPINLOCK(cpu_add_remove_lock);
 
 /*
@@ -805,14 +805,6 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
-static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
-{
-       if (idle_vcpu[cpu])
-               return idle_vcpu[cpu];
-
-       return alloc_idle_vcpu(cpu);
-}
-
 static void *prepare_idle_stack(unsigned int cpu)
 {
        if (!stack_base[cpu])
@@ -849,7 +841,7 @@ static int __devinit do_boot_cpu(int api
 
        booting_cpu = cpu;
 
-       v = prepare_idle_vcpu(cpu);
+       v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/time.c       Tue Nov 04 12:43:19 2008 +0900
@@ -1063,8 +1063,6 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    local_irq_disable();
-
     /* check if TSC is invariant during deep C state
        this is a new feature introduced by Nehalem*/
     if ( cpuid_edx(0x80000007) & (1u<<8) )
@@ -1078,8 +1076,6 @@ int __init init_xen_time(void)
     init_platform_timer();
 
     do_settime(get_cmos_time(), 0, NOW());
-
-    local_irq_enable();
 
     return 0;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/traps.c      Tue Nov 04 12:43:19 2008 +0900
@@ -1030,7 +1030,7 @@ static int handle_gdt_ldt_mapping_fault(
 #endif
 
 static int __spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1052,17 +1052,17 @@ static int __spurious_page_fault(
         return 0;
 
     /* Reserved bit violations are never spurious faults. */
-    if ( regs->error_code & PFEC_reserved_bit )
+    if ( error_code & PFEC_reserved_bit )
         return 0;
 
     required_flags  = _PAGE_PRESENT;
-    if ( regs->error_code & PFEC_write_access )
+    if ( error_code & PFEC_write_access )
         required_flags |= _PAGE_RW;
-    if ( regs->error_code & PFEC_user_mode )
+    if ( error_code & PFEC_user_mode )
         required_flags |= _PAGE_USER;
 
     disallowed_flags = 0;
-    if ( regs->error_code & PFEC_insn_fetch )
+    if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX;
 
     mfn = cr3 >> PAGE_SHIFT;
@@ -1120,7 +1120,7 @@ static int __spurious_page_fault(
     dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
             "at addr %lx, e/c %04x\n",
             current->domain->domain_id, current->vcpu_id,
-            addr, regs->error_code);
+            addr, error_code);
 #if CONFIG_PAGING_LEVELS >= 4
     dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
 #endif
@@ -1129,14 +1129,11 @@ static int __spurious_page_fault(
 #endif
     dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
     dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
-#ifndef NDEBUG
-    show_registers(regs);
-#endif
     return 1;
 }
 
 static int spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
     int           is_spurious;
@@ -1146,7 +1143,7 @@ static int spurious_page_fault(
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, regs);
+    is_spurious = __spurious_page_fault(addr, error_code);
     local_irq_restore(flags);
 
     return is_spurious;
@@ -1208,8 +1205,12 @@ asmlinkage void do_page_fault(struct cpu
 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
 {
     unsigned long addr, fixup;
+    unsigned int error_code;
 
     addr = read_cr2();
+
+    /* fixup_page_fault() might change regs->error_code, so cache it here. */
+    error_code = regs->error_code;
 
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
@@ -1220,7 +1221,7 @@ asmlinkage void do_page_fault(struct cpu
 
     if ( unlikely(!guest_mode(regs)) )
     {
-        if ( spurious_page_fault(addr, regs) )
+        if ( spurious_page_fault(addr, error_code) )
             return;
 
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
@@ -1239,11 +1240,11 @@ asmlinkage void do_page_fault(struct cpu
         panic("FATAL PAGE FAULT\n"
               "[error_code=%04x]\n"
               "Faulting linear address: %p\n",
-              regs->error_code, _p(addr));
+              error_code, _p(addr));
     }
 
     if ( unlikely(current->domain->arch.suppress_spurious_page_faults
-                  && spurious_page_fault(addr, regs)) )
+                  && spurious_page_fault(addr, error_code)) )
         return;
 
     propagate_page_fault(addr, regs->error_code);
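
The do_page_fault() change above latches regs->error_code into a local before fixup_page_fault() can rewrite it, so the spurious-fault checks and the panic message describe the fault as it was taken. A tiny sketch of that latch-before-mutation pattern (invented names):

#include <stdio.h>

struct regs { unsigned int error_code; };

static void fixup(struct regs *r) { r->error_code |= 0x8000; /* mutates */ }

int main(void)
{
    struct regs r = { .error_code = 0x0002 };
    unsigned int error_code = r.error_code;   /* latch before fixup */
    fixup(&r);
    printf("reported e/c %#06x (frame now %#06x)\n", error_code, r.error_code);
    return 0;
}
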
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:43:19 2008 +0900
@@ -43,7 +43,7 @@ void *map_domain_page(unsigned long mfn)
 void *map_domain_page(unsigned long mfn)
 {
     unsigned long va;
-    unsigned int idx, i;
+    unsigned int idx, i, flags;
     struct vcpu *v;
     struct mapcache_domain *dcache;
     struct mapcache_vcpu *vcache;
@@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn)
         goto out;
     }
 
-    spin_lock(&dcache->lock);
+    spin_lock_irqsave(&dcache->lock, flags);
 
     /* Has some other CPU caused a wrap? We must flush if so. */
     if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
@@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn)
     set_bit(idx, dcache->inuse);
     dcache->cursor = idx + 1;
 
-    spin_unlock(&dcache->lock);
+    spin_unlock_irqrestore(&dcache->lock, flags);
 
     l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
 
@@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn)
     return (void *)va;
 }
 
-void unmap_domain_page(void *va)
+void unmap_domain_page(const void *va)
 {
     unsigned int idx;
     struct vcpu *v;
@@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo
     return (void *)va;
 }
 
-void unmap_domain_page_global(void *va)
+void unmap_domain_page_global(const void *va)
 {
     unsigned long __va = (unsigned long)va;
     l2_pgentry_t *pl2e;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_64/compat/mm.c   Tue Nov 04 12:43:19 2008 +0900
@@ -231,6 +231,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
             case MMUEXT_PIN_L4_TABLE:
             case MMUEXT_UNPIN_TABLE:
             case MMUEXT_NEW_BASEPTR:
+            case MMUEXT_CLEAR_PAGE:
+            case MMUEXT_COPY_PAGE:
                 arg1 = XLAT_mmuext_op_arg1_mfn;
                 break;
             default:
@@ -257,6 +259,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
             case MMUEXT_TLB_FLUSH_MULTI:
             case MMUEXT_INVLPG_MULTI:
                 arg2 = XLAT_mmuext_op_arg2_vcpumask;
+                break;
+            case MMUEXT_COPY_PAGE:
+                arg2 = XLAT_mmuext_op_arg2_src_mfn;
                 break;
             default:
                 arg2 = -1;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/cpufreq.c
--- a/xen/arch/x86/x86_64/cpufreq.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_64/cpufreq.c     Tue Nov 04 12:43:19 2008 +0900
@@ -56,34 +56,13 @@ compat_set_px_pminfo(uint32_t cpu, struc
        return -EFAULT;
 
 #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \
-    xen_processor_px_t *xen_states = NULL; \
-\
-    if ( likely((_s_)->state_count > 0) ) \
-    { \
-        XEN_GUEST_HANDLE(compat_processor_px_t) states; \
-        compat_processor_px_t state; \
-        int i; \
-\
-        xen_states = xlat_malloc_array(xlat_page_current, \
-                               xen_processor_px_t, (_s_)->state_count); \
-        if ( unlikely(xen_states == NULL) ) \
-            return -EFAULT; \
-\
-        if ( unlikely(!compat_handle_okay((_s_)->states, \
-                                (_s_)->state_count)) ) \
-            return -EFAULT; \
-        guest_from_compat_handle(states, (_s_)->states); \
-\
-        for ( i = 0; i < _s_->state_count; i++ ) \
-        { \
-           if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
-               return -EFAULT; \
-           XLAT_processor_px(&xen_states[i], &state); \
-        } \
-    } \
-\
-    set_xen_guest_handle((_d_)->states, xen_states); \
+    XEN_GUEST_HANDLE(compat_processor_px_t) states; \
+    if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \
+        return -EFAULT; \
+    guest_from_compat_handle(states, (_s_)->states); \
+    (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \
 } while (0)
+
     XLAT_processor_performance(xen_perf, perf);
 #undef XLAT_processor_performance_HNDL_states
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/event_channel.c
--- a/xen/common/event_channel.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/event_channel.c        Tue Nov 04 12:43:19 2008 +0900
@@ -386,7 +386,7 @@ static long __evtchn_close(struct domain
             if ( v->virq_to_evtchn[chn1->u.virq] != port1 )
                 continue;
             v->virq_to_evtchn[chn1->u.virq] = 0;
-            spin_barrier(&v->virq_lock);
+            spin_barrier_irq(&v->virq_lock);
         }
         break;
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/kernel.c
--- a/xen/common/kernel.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/kernel.c       Tue Nov 04 12:43:19 2008 +0900
@@ -221,7 +221,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
 #ifdef CONFIG_X86
             if ( !is_hvm_vcpu(current) )
-                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
+                fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
+                             (1U << XENFEAT_highmem_assist);
 #endif
             break;
         default:
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/keyhandler.c   Tue Nov 04 12:43:19 2008 +0900
@@ -183,9 +183,9 @@ static void dump_domains(unsigned char k
     {
         printk("General information for domain %u:\n", d->domain_id);
         cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
-        printk("    refcnt=%d nr_pages=%d xenheap_pages=%d "
+        printk("    refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
                "dirty_cpus=%s\n",
-               atomic_read(&d->refcnt),
+               atomic_read(&d->refcnt), d->is_dying,
                d->tot_pages, d->xenheap_pages, tmpstr);
         printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
                "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/spinlock.c
--- a/xen/common/spinlock.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/spinlock.c     Tue Nov 04 12:43:19 2008 +0900
@@ -1,15 +1,56 @@
 #include <xen/config.h>
+#include <xen/irq.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
 
+#ifndef NDEBUG
+
+static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0);
+
+static void check_lock(struct lock_debug *debug)
+{
+    int irq_safe = !local_irq_is_enabled();
+
+    if ( unlikely(atomic_read(&spin_debug) <= 0) )
+        return;
+
+    /* A few places take liberties with this. */
+    /* BUG_ON(in_irq() && !irq_safe); */
+
+    if ( unlikely(debug->irq_safe != irq_safe) )
+    {
+        int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);
+        BUG_ON(seen == !irq_safe);
+    }
+}
+
+void spin_debug_enable(void)
+{
+    atomic_inc(&spin_debug);
+}
+
+void spin_debug_disable(void)
+{
+    atomic_dec(&spin_debug);
+}
+
+#else /* defined(NDEBUG) */
+
+#define check_lock(l) ((void)0)
+
+#endif
+
 void _spin_lock(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
 }
 
 void _spin_lock_irq(spinlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
 }
 
@@ -17,6 +58,7 @@ unsigned long _spin_lock_irqsave(spinloc
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
     return flags;
 }
@@ -40,26 +82,39 @@ void _spin_unlock_irqrestore(spinlock_t 
 
 int _spin_is_locked(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     return _raw_spin_is_locked(&lock->raw);
 }
 
 int _spin_trylock(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     return _raw_spin_trylock(&lock->raw);
 }
 
 void _spin_barrier(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     do { mb(); } while ( _raw_spin_is_locked(&lock->raw) );
     mb();
 }
 
+void _spin_barrier_irq(spinlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    _spin_barrier(lock);
+    local_irq_restore(flags);
+}
+
 void _spin_lock_recursive(spinlock_t *lock)
 {
     int cpu = smp_processor_id();
 
     /* Don't allow overflow of recurse_cpu field. */
     BUILD_BUG_ON(NR_CPUS > 0xfffu);
+
+    check_lock(&lock->debug);
 
     if ( likely(lock->recurse_cpu != cpu) )
     {
@@ -83,12 +138,15 @@ void _spin_unlock_recursive(spinlock_t *
 
 void _read_lock(rwlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
 }
 
 void _read_lock_irq(rwlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
 }
 
@@ -96,6 +154,7 @@ unsigned long _read_lock_irqsave(rwlock_
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
     return flags;
 }
@@ -119,12 +178,15 @@ void _read_unlock_irqrestore(rwlock_t *l
 
 void _write_lock(rwlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
 }
 
 void _write_lock_irq(rwlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
 }
 
@@ -132,6 +194,7 @@ unsigned long _write_lock_irqsave(rwlock
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
     return flags;
 }
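
check_lock() above enforces a simple deadlock-avoidance invariant: any given lock must be taken either always with interrupts enabled or always with them disabled, otherwise an interrupt arriving on a CPU that already holds the lock could spin on it forever. The first acquisition latches the mode (the real code latches with cmpxchg against -1 so the check is SMP-safe); later acquisitions must agree. A simplified single-CPU sketch:

#include <assert.h>
#include <stdio.h>

struct lock_debug { int irq_safe; };          /* -1 = not yet latched */

static void check_lock(struct lock_debug *d, int irqs_disabled)
{
    if ( d->irq_safe == -1 )
        d->irq_safe = irqs_disabled;          /* first use sets the rule */
    assert(d->irq_safe == irqs_disabled);     /* later uses must agree   */
}

int main(void)
{
    struct lock_debug d = { .irq_safe = -1 };
    check_lock(&d, 1);   /* taken with IRQs off: latches irq-safe */
    check_lock(&d, 1);   /* consistent -- fine                    */
    puts("consistent usage ok");
    /* check_lock(&d, 0); would trip the assertion, like BUG_ON above. */
    return 0;
}
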
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/timer.c
--- a/xen/common/timer.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/timer.c        Tue Nov 04 12:43:19 2008 +0900
@@ -25,10 +25,12 @@
  * We pull handlers off the timer list this far in future,
  * rather than reprogramming the time hardware.
  */
-#define TIMER_SLOP (50*1000) /* ns */
+static unsigned int timer_slop __read_mostly = 50000; /* 50 us */
+integer_param("timer_slop", timer_slop);
 
 struct timers {
     spinlock_t     lock;
+    bool_t         overflow;
     struct timer **heap;
     struct timer  *list;
     struct timer  *running;
@@ -200,6 +202,7 @@ static int add_entry(struct timers *time
         return rc;
 
     /* Fall back to adding to the slower linked list. */
+    timers->overflow = 1;
     t->status = TIMER_STATUS_in_list;
     return add_to_list(&timers->list, t);
 }
@@ -258,6 +261,7 @@ void set_timer(struct timer *timer, s_ti
         __stop_timer(timer);
 
     timer->expires = expires;
+    timer->expires_end = expires + timer_slop;
 
     if ( likely(timer->status != TIMER_STATUS_killed) )
         __add_timer(timer);
@@ -344,19 +348,30 @@ void kill_timer(struct timer *timer)
 }
 
 
+static void execute_timer(struct timers *ts, struct timer *t)
+{
+    void (*fn)(void *) = t->function;
+    void *data = t->data;
+
+    ts->running = t;
+    spin_unlock_irq(&ts->lock);
+    (*fn)(data);
+    spin_lock_irq(&ts->lock);
+    ts->running = NULL;
+}
+
+
 static void timer_softirq_action(void)
 {
     struct timer  *t, **heap, *next;
     struct timers *ts;
-    s_time_t       now, deadline;
-    void         (*fn)(void *);
-    void          *data;
+    s_time_t       now;
 
     ts = &this_cpu(timers);
     heap = ts->heap;
 
-    /* If we are using overflow linked list, try to allocate a larger heap. */
-    if ( unlikely(ts->list != NULL) )
+    /* If we overflowed the heap, try to allocate a larger heap. */
+    if ( unlikely(ts->overflow) )
     {
         /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
         int old_limit = GET_HEAP_LIMIT(heap);
@@ -377,7 +392,26 @@ static void timer_softirq_action(void)
 
     spin_lock_irq(&ts->lock);
 
-    /* Try to move timers from overflow linked list to more efficient heap. */
+    now = NOW();
+
+    /* Execute ready heap timers. */
+    while ( (GET_HEAP_SIZE(heap) != 0) &&
+            ((t = heap[1])->expires_end < now) )
+    {
+        remove_from_heap(heap, t);
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Execute ready list timers. */
+    while ( ((t = ts->list) != NULL) && (t->expires_end < now) )
+    {
+        ts->list = t->list_next;
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Try to move timers from linked list to more efficient heap. */
     next = ts->list;
     ts->list = NULL;
     while ( unlikely((t = next) != NULL) )
@@ -387,51 +421,44 @@ static void timer_softirq_action(void)
         add_entry(ts, t);
     }
 
-    now = NOW();
-
-    while ( (GET_HEAP_SIZE(heap) != 0) &&
-            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
-    {
-        remove_entry(ts, t);
-
-        ts->running = t;
-
-        fn   = t->function;
-        data = t->data;
-
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
-    }
-
-    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
-
-    while ( unlikely((t = ts->list) != NULL) )
-    {
-        if ( t->expires >= (now + TIMER_SLOP) )
+    ts->overflow = (ts->list != NULL);
+    if ( unlikely(ts->overflow) )
+    {
+        /* Find earliest deadline at head of list or top of heap. */
+        this_cpu(timer_deadline) = ts->list->expires;
+        if ( (GET_HEAP_SIZE(heap) != 0) &&
+             ((t = heap[1])->expires < this_cpu(timer_deadline)) )
+            this_cpu(timer_deadline) = t->expires;
+    }
+    else
+    {
+        /*
+         * Find the earliest deadline that encompasses largest number of timers
+         * on the heap. To do this we take timers from the heap while their
+         * valid deadline ranges continue to intersect.
+         */
+        s_time_t start = 0, end = STIME_MAX;
+        struct timer **list_tail = &ts->list;
+
+        while ( (GET_HEAP_SIZE(heap) != 0) &&
+                ((t = heap[1])->expires <= end) )
         {
-            if ( (deadline == 0) || (deadline > t->expires) )
-                deadline = t->expires;
-            break;
+            remove_entry(ts, t);
+
+            t->status = TIMER_STATUS_in_list;
+            t->list_next = NULL;
+            *list_tail = t;
+            list_tail = &t->list_next;
+
+            start = t->expires;
+            if ( end > t->expires_end )
+                end = t->expires_end;
         }
 
-        ts->list = t->list_next;
-        t->status = TIMER_STATUS_inactive;
-
-        ts->running = t;
-
-        fn   = t->function;
-        data = t->data;
-
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
-    }
-
-    ts->running = NULL;
-
-    this_cpu(timer_deadline) = deadline;
-    if ( !reprogram_timer(deadline) )
+        this_cpu(timer_deadline) = start;
+    }
+
+    if ( !reprogram_timer(this_cpu(timer_deadline)) )
         raise_softirq(TIMER_SOFTIRQ);
 
     spin_unlock_irq(&ts->lock);
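
The rewritten softirq handler above exploits the new per-timer window [expires, expires_end] (where expires_end = expires + timer_slop): it keeps pulling the heap's earliest timer while its window still intersects the running intersection, then programs the hardware once, at the latest expires of the batch, which by construction lies inside every batched timer's window. A sketch of that batching arithmetic over a pre-sorted array (the real code pops a binary heap):

#include <stdio.h>

typedef long long s_time_t;
#define STIME_MAX ((s_time_t)0x7fffffffffffffffLL)

struct timer { s_time_t expires, expires_end; };

int main(void)
{
    /* Sorted by expires; the first three windows mutually intersect. */
    struct timer heap[] = { {100, 150}, {120, 170}, {140, 160}, {200, 250} };
    s_time_t start = 0, end = STIME_MAX;
    int i, batched = 0;

    for ( i = 0; i < 4 && heap[i].expires <= end; i++ )
    {
        start = heap[i].expires;           /* latest start seen so far */
        if ( end > heap[i].expires_end )
            end = heap[i].expires_end;     /* earliest end seen so far */
        batched++;
    }
    /* One hardware reprogram at 'start' satisfies all batched timers. */
    printf("batched %d timers; fire once at %lld (window [%lld,%lld])\n",
           batched, start, start, end);
    return 0;
}
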
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xenoprof.c
--- a/xen/common/xenoprof.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/xenoprof.c     Tue Nov 04 12:43:19 2008 +0900
@@ -85,7 +85,7 @@ int is_active(struct domain *d)
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE));
 }
 
-static int is_passive(struct domain *d)
+int is_passive(struct domain *d)
 {
     struct xenoprof *x = d->xenoprof;
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE));
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xmalloc.c
--- a/xen/common/xmalloc.c      Tue Nov 04 12:07:22 2008 +0900
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,286 +0,0 @@
-/******************************************************************************
- * Simple allocator for Xen.  If larger than a page, simply use the
- * page-order allocator.
- *
- * Copyright (C) 2005 Rusty Russell IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-/*
- * TODO (Keir, 17/2/05):
- *  1. Use space in page_info to avoid xmalloc_hdr in allocated blocks.
- *  2. page_info points into free list to make xfree() O(1) complexity.
- *  3. Perhaps make this a sub-page buddy allocator? xmalloc() == O(1).
- *     (Disadvantage is potentially greater internal fragmentation).
- */
-
-#include <xen/config.h>
-#include <xen/mm.h>
-#include <xen/spinlock.h>
-#include <xen/timer.h>
-#include <xen/cache.h>
-#include <xen/prefetch.h>
-#include <xen/irq.h>
-#include <xen/smp.h>
-
-/*
- * XMALLOC_DEBUG:
- *  1. Free data blocks are filled with poison bytes.
- *  2. In-use data blocks have guard bytes at the start and end.
- */
-#ifndef NDEBUG
-#define XMALLOC_DEBUG 1
-#endif
-
-static LIST_HEAD(freelist);
-static DEFINE_SPINLOCK(freelist_lock);
-
-struct xmalloc_hdr
-{
-    /* Size is total including this header. */
-    size_t size;
-    struct list_head freelist;
-} __cacheline_aligned;
-
-static void add_to_freelist(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    memset(hdr + 1, 0xa5, hdr->size - sizeof(*hdr));
-#endif
-    list_add(&hdr->freelist, &freelist);
-}
-
-static void del_from_freelist(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    size_t i;
-    unsigned char *data = (unsigned char *)(hdr + 1);
-    for ( i = 0; i < (hdr->size - sizeof(*hdr)); i++ )
-        BUG_ON(data[i] != 0xa5);
-    BUG_ON((hdr->size <= 0) || (hdr->size >= PAGE_SIZE));
-#endif
-    list_del(&hdr->freelist);
-}
-
-static void *data_from_header(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    /* Data block contain SMP_CACHE_BYTES of guard canary. */
-    unsigned char *data = (unsigned char *)(hdr + 1);
-    memset(data, 0x5a, SMP_CACHE_BYTES);
-    memset(data + hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES,
-           0x5a, SMP_CACHE_BYTES);
-    return data + SMP_CACHE_BYTES;
-#else
-    return hdr + 1;
-#endif
-}
-
-static struct xmalloc_hdr *header_from_data(void *p)
-{
-#if XMALLOC_DEBUG
-    unsigned char *data = (unsigned char *)p - SMP_CACHE_BYTES;
-    struct xmalloc_hdr *hdr = (struct xmalloc_hdr *)data - 1;
-    size_t i;
-
-    /* Check header guard canary. */
-    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
-        BUG_ON(data[i] != 0x5a);
-
-    /* Check footer guard canary. */
-    data += hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES;
-    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
-        BUG_ON(data[i] != 0x5a);
-
-    return hdr;
-#else
-    return (struct xmalloc_hdr *)p - 1;
-#endif
-}
-
-static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block)
-{
-    struct xmalloc_hdr *extra;
-    size_t leftover = block - size;
-
-    /* If enough is left to make a block, put it on free list. */
-    if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) )
-    {
-        extra = (struct xmalloc_hdr *)((unsigned long)hdr + size);
-        extra->size = leftover;
-        add_to_freelist(extra);
-    }
-    else
-    {
-        size = block;
-    }
-
-    hdr->size = size;
-    /* Debugging aid. */
-    hdr->freelist.next = hdr->freelist.prev = NULL;
-}
-
-static void *xmalloc_new_page(size_t size)
-{
-    struct xmalloc_hdr *hdr;
-
-    hdr = alloc_xenheap_page();
-    if ( hdr == NULL )
-        return NULL;
-
-    spin_lock(&freelist_lock);
-    maybe_split(hdr, size, PAGE_SIZE);
-    spin_unlock(&freelist_lock);
-
-    return data_from_header(hdr);
-}
-
-/* Big object?  Just use the page allocator. */
-static void *xmalloc_whole_pages(size_t size)
-{
-    struct xmalloc_hdr *hdr;
-    unsigned int pageorder = get_order_from_bytes(size);
-
-    hdr = alloc_xenheap_pages(pageorder);
-    if ( hdr == NULL )
-        return NULL;
-
-    hdr->size = (1 << (pageorder + PAGE_SHIFT));
-    /* Debugging aid. */
-    hdr->freelist.next = hdr->freelist.prev = NULL;
-
-    return data_from_header(hdr);
-}
-
-/* Return size, increased to alignment with align. */
-static inline size_t align_up(size_t size, size_t align)
-{
-    return (size + align - 1) & ~(align - 1);
-}
-
-void *_xmalloc(size_t size, size_t align)
-{
-    struct xmalloc_hdr *i;
-
-    ASSERT(!in_irq());
-
-    /* We currently always return cacheline aligned. */
-    BUG_ON(align > SMP_CACHE_BYTES);
-
-#if XMALLOC_DEBUG
-    /* Add room for canaries at start and end of data block. */
-    size += 2 * SMP_CACHE_BYTES;
-#endif
-
-    /* Add room for header, pad to align next header. */
-    size += sizeof(struct xmalloc_hdr);
-    size = align_up(size, __alignof__(struct xmalloc_hdr));
-
-    /* For big allocs, give them whole pages. */
-    if ( size >= PAGE_SIZE )
-        return xmalloc_whole_pages(size);
-
-    /* Search free list. */
-    spin_lock(&freelist_lock);
-    list_for_each_entry( i, &freelist, freelist )
-    {
-        if ( i->size < size )
-            continue;
-        del_from_freelist(i);
-        maybe_split(i, size, i->size);
-        spin_unlock(&freelist_lock);
-        return data_from_header(i);
-    }
-    spin_unlock(&freelist_lock);
-
-    /* Alloc a new page and return from that. */
-    return xmalloc_new_page(size);
-}
-
-void xfree(void *p)
-{
-    struct xmalloc_hdr *i, *tmp, *hdr;
-
-    ASSERT(!in_irq());
-
-    if ( p == NULL )
-        return;
-
-    hdr = header_from_data(p);
-
-    /* We know hdr will be on same page. */
-    BUG_ON(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK));
-
-    /* Not previously freed. */
-    BUG_ON(hdr->freelist.next || hdr->freelist.prev);
-
-    /* Big allocs free directly. */
-    if ( hdr->size >= PAGE_SIZE )
-    {
-        free_xenheap_pages(hdr, get_order_from_bytes(hdr->size));
-        return;
-    }
-
-    /* Merge with other free block, or put in list. */
-    spin_lock(&freelist_lock);
-    list_for_each_entry_safe( i, tmp, &freelist, freelist )
-    {
-        unsigned long _i   = (unsigned long)i;
-        unsigned long _hdr = (unsigned long)hdr;
-
-        /* Do not merge across page boundaries. */
-        if ( ((_i ^ _hdr) & PAGE_MASK) != 0 )
-            continue;
-
-        /* We follow this block?  Swallow it. */
-        if ( (_i + i->size) == _hdr )
-        {
-            del_from_freelist(i);
-            i->size += hdr->size;
-            hdr = i;
-        }
-
-        /* We precede this block? Swallow it. */
-        if ( (_hdr + hdr->size) == _i )
-        {
-            del_from_freelist(i);
-            hdr->size += i->size;
-        }
-    }
-
-    /* Did we merge an entire page? */
-    if ( hdr->size == PAGE_SIZE )
-    {
-        BUG_ON((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0);
-        free_xenheap_pages(hdr, 0);
-    }
-    else
-    {
-        add_to_freelist(hdr);
-    }
-
-    spin_unlock(&freelist_lock);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
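
For reference, the deleted xen/common/xmalloc.c implemented a simple first-fit allocator over single xenheap pages. The sizing of a request is the least obvious part; here is a minimal standalone sketch of it (illustrative header and cache-line sizes -- the real header is __cacheline_aligned, so both values depend on the build):

    #include <stdio.h>

    static size_t align_up(size_t size, size_t align)
    {
        return (size + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        size_t smp_cache_bytes = 128;  /* illustrative SMP_CACHE_BYTES */
        size_t hdr = 16;               /* illustrative sizeof(xmalloc_hdr) */
        size_t size = 100;             /* caller's request */

        size += 2 * smp_cache_bytes;   /* XMALLOC_DEBUG guard canaries */
        size += hdr;                   /* room for the block header */
        size = align_up(size, hdr);    /* keep the next header aligned */
        printf("100-byte request -> %zu-byte block\n", size);
        return 0;
    }

Blocks of PAGE_SIZE or more bypass the free list entirely and go straight to the page allocator, which is why xfree() can test hdr->size >= PAGE_SIZE to pick the release path.
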
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/char/serial.c
--- a/xen/drivers/char/serial.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/drivers/char/serial.c Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_p
     while ( !spin_trylock(&port->tx_lock) )
     {
         if ( !port->driver->tx_empty(port) )
-            return;
+            goto out;
         cpu_relax();
     }
 
@@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_p
         }
     }
 
-    spin_unlock_irqrestore(&port->tx_lock, flags);
+    spin_unlock(&port->tx_lock);
+
+ out:
+    local_irq_restore(flags);
 }
 
 static void __serial_putc(struct serial_port *port, char c)
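
The serial change fixes an IRQ-flags leak: the early return inside the trylock loop bailed out with interrupts still disabled (the matching local_irq_save() sits above the hunk). The corrected shape, condensed -- a sketch, not the full driver:

    local_irq_save(flags);

    while ( !spin_trylock(&port->tx_lock) )
    {
        if ( !port->driver->tx_empty(port) )
            goto out;             /* was: return -- left IRQs disabled */
        cpu_relax();
    }

    /* ... drain the transmit FIFO ... */

    spin_unlock(&port->tx_lock);
     out:
    local_irq_restore(flags);
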
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/cpufreq/cpufreq.c
--- a/xen/drivers/cpufreq/cpufreq.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/drivers/cpufreq/cpufreq.c     Tue Nov 04 12:43:19 2008 +0900
@@ -31,6 +31,7 @@
 #include <xen/errno.h>
 #include <xen/delay.h>
 #include <xen/cpumask.h>
+#include <xen/list.h>
 #include <xen/sched.h>
 #include <xen/timer.h>
 #include <xen/xmalloc.h>
@@ -44,8 +45,12 @@
 #include <acpi/acpi.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-/* TODO: change to link list later as domain number may be sparse */
-static cpumask_t cpufreq_dom_map[NR_CPUS];
+struct cpufreq_dom {
+    unsigned int       dom;
+    cpumask_t          map;
+    struct list_head   node;
+};
+static LIST_HEAD(cpufreq_dom_list_head);
 
 int cpufreq_limit_change(unsigned int cpu)
 {
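
The fixed-size cpufreq_dom_map[NR_CPUS] array gives way to a list keyed on the domain id, which (as the removed TODO noted) may be sparse. The add/del paths below open-code the same lookup; as a helper it would read (hypothetical function name, using Xen's <xen/list.h> primitives):

    static struct cpufreq_dom *cpufreq_dom_find(unsigned int dom)
    {
        struct list_head *pos;

        list_for_each ( pos, &cpufreq_dom_list_head )
        {
            struct cpufreq_dom *d = list_entry(pos, struct cpufreq_dom, node);

            if ( d->dom == dom )
                return d;
        }

        return NULL;
    }
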
@@ -72,48 +77,80 @@ int cpufreq_add_cpu(unsigned int cpu)
 {
     int ret = 0;
     unsigned int firstcpu;
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
     unsigned int j;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy new_policy;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
+        return -EINVAL;
+
+    if (cpufreq_cpu_policy[cpu])
         return 0;
-
-    if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu])
-        return -EINVAL;
 
     ret = cpufreq_statistic_init(cpu);
     if (ret)
         return ret;
 
     dom = perf->domain_info.domain;
-    if (cpus_weight(cpufreq_dom_map[dom])) {
+
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (domexist) {
         /* share policy with the first cpu since on same boat */
-        firstcpu = first_cpu(cpufreq_dom_map[dom]);
+        firstcpu = first_cpu(cpufreq_dom->map);
         policy = cpufreq_cpu_policy[firstcpu];
 
         cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
+        cpu_set(cpu, cpufreq_dom->map);
         cpu_set(cpu, policy->cpus);
+
+        /* domain coordination sanity check */
+        if ((perf->domain_info.coord_type !=
+             processor_pminfo[firstcpu]->perf.domain_info.coord_type) ||
+            (perf->domain_info.num_processors !=
+             processor_pminfo[firstcpu]->perf.domain_info.num_processors)) {
+            ret = -EINVAL;
+            goto err2;
+        }
 
         printk(KERN_EMERG"adding CPU %u\n", cpu);
     } else {
+        cpufreq_dom = xmalloc(struct cpufreq_dom);
+        if (!cpufreq_dom) {
+            cpufreq_statistic_exit(cpu);
+            return -ENOMEM;
+        }
+        memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom));
+        cpufreq_dom->dom = dom;
+        cpu_set(cpu, cpufreq_dom->map);
+        list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);
+
         /* for the first cpu, setup policy and do init work */
         policy = xmalloc(struct cpufreq_policy);
         if (!policy) {
+            list_del(&cpufreq_dom->node);
+            xfree(cpufreq_dom);
             cpufreq_statistic_exit(cpu);
             return -ENOMEM;
         }
         memset(policy, 0, sizeof(struct cpufreq_policy));
-
+        policy->cpu = cpu;
+        cpu_set(cpu, policy->cpus);
         cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
-        cpu_set(cpu, policy->cpus);
-
-        policy->cpu = cpu;
+
         ret = cpufreq_driver->init(policy);
         if (ret)
             goto err1;
@@ -124,7 +161,7 @@ int cpufreq_add_cpu(unsigned int cpu)
      * After get full cpumap of the coordination domain,
      * we can safely start gov here.
      */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors) {
         memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
         policy->governor = NULL;
@@ -138,51 +175,68 @@ err2:
 err2:
     cpufreq_driver->exit(policy);
 err1:
-    for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
+    for_each_cpu_mask(j, cpufreq_dom->map) {
         cpufreq_cpu_policy[j] = NULL;
         cpufreq_statistic_exit(j);
     }
 
-    cpus_clear(cpufreq_dom_map[dom]);
+    list_del(&cpufreq_dom->node);
+    xfree(cpufreq_dom);
     xfree(policy);
     return ret;
 }
 
 int cpufreq_del_cpu(unsigned int cpu)
 {
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
+        return -EINVAL;
+
+    if (!cpufreq_cpu_policy[cpu])
         return 0;
-
-    if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu])
-        return -EINVAL;
 
     dom = perf->domain_info.domain;
     policy = cpufreq_cpu_policy[cpu];
 
-    printk(KERN_EMERG"deleting CPU %u\n", cpu);
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (!domexist)
+        return -EINVAL;
 
     /* for the first cpu of the domain, stop gov */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors)
         __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 
     cpufreq_cpu_policy[cpu] = NULL;
     cpu_clear(cpu, policy->cpus);
-    cpu_clear(cpu, cpufreq_dom_map[dom]);
+    cpu_clear(cpu, cpufreq_dom->map);
     cpufreq_statistic_exit(cpu);
 
     /* for the last cpu of the domain, clean room */
     /* It's safe here to free freq_table, drv_data and policy */
-    if (!cpus_weight(cpufreq_dom_map[dom])) {
+    if (!cpus_weight(cpufreq_dom->map)) {
         cpufreq_driver->exit(policy);
+        list_del(&cpufreq_dom->node);
+        xfree(cpufreq_dom);
         xfree(policy);
     }
 
+    printk(KERN_EMERG"deleting CPU %u\n", cpu);
     return 0;
 }
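
One detail worth calling out in the rewritten add/del pair: every failure branch unwinds exactly the state established so far, in reverse order. Condensed from the add path above:

    ret = cpufreq_statistic_init(cpu);
    if ( ret )
        return ret;                       /* nothing to unwind yet */

    cpufreq_dom = xmalloc(struct cpufreq_dom);
    if ( !cpufreq_dom )
    {
        cpufreq_statistic_exit(cpu);      /* undo statistic init */
        return -ENOMEM;
    }
    list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);

    policy = xmalloc(struct cpufreq_policy);
    if ( !policy )
    {
        list_del(&cpufreq_dom->node);     /* undo the list insertion */
        xfree(cpufreq_dom);               /* ... and the allocation */
        cpufreq_statistic_exit(cpu);
        return -ENOMEM;
    }
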
 
@@ -258,6 +312,24 @@ int set_px_pminfo(uint32_t acpi_id, stru
 
     if ( dom0_px_info->flags & XEN_PX_PCT )
     {
+        /* space_id check */
+        if (dom0_px_info->control_register.space_id != 
+            dom0_px_info->status_register.space_id)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+
+#ifdef CONFIG_IA64
+        /* for IA64, currently it only supports FFH */
+        if (dom0_px_info->control_register.space_id !=
+            ACPI_ADR_SPACE_FIXED_HARDWARE)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         memcpy ((void *)&pxpt->control_register,
                 (void *)&dom0_px_info->control_register,
                 sizeof(struct xen_pct_register));
@@ -267,8 +339,16 @@ int set_px_pminfo(uint32_t acpi_id, stru
         print_PCT(&pxpt->control_register);
         print_PCT(&pxpt->status_register);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSS ) 
     {
+        /* capability check */
+        if (dom0_px_info->state_count <= 1)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+
         if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
                         dom0_px_info->state_count)) )
         {
@@ -280,14 +360,28 @@ int set_px_pminfo(uint32_t acpi_id, stru
         pxpt->state_count = dom0_px_info->state_count;
         print_PSS(pxpt->states,pxpt->state_count);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSD )
     {
+#ifdef CONFIG_X86
+        /* for X86, check domain coordination */
+        /* for IA64, _PSD is optional for current IA64 cpufreq algorithm */
+        if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         pxpt->shared_type = dom0_px_info->shared_type;
         memcpy ((void *)&pxpt->domain_info,
                 (void *)&dom0_px_info->domain_info,
                 sizeof(struct xen_psd_package));
         print_PSD(&pxpt->domain_info);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PPC )
     {
         pxpt->platform_limit = dom0_px_info->platform_limit;
@@ -295,7 +389,6 @@ int set_px_pminfo(uint32_t acpi_id, stru
 
         if ( pxpt->init == XEN_PX_INIT )
         {
-
             ret = cpufreq_limit_change(cpuid); 
             goto out;
         }
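
set_px_pminfo() now rejects malformed dom0-supplied Px data up front. Collected in one place for readability (a hypothetical helper -- the patch performs these checks inline per flag, and the parameter type is abridged in the hunk header, so its exact name is assumed here):

    static int px_sanity_check(const struct xen_processor_performance *info)
    {
        /* _PCT: control and status registers must share one space_id. */
        if ( (info->flags & XEN_PX_PCT) &&
             (info->control_register.space_id !=
              info->status_register.space_id) )
            return -EINVAL;

        /* _PSS: a single P-state is useless for frequency scaling. */
        if ( (info->flags & XEN_PX_PSS) && (info->state_count <= 1) )
            return -EINVAL;

    #ifdef CONFIG_X86
        /* _PSD: only known coordination types are accepted. */
        if ( (info->flags & XEN_PX_PSD) &&
             (info->shared_type != CPUFREQ_SHARED_TYPE_ALL) &&
             (info->shared_type != CPUFREQ_SHARED_TYPE_ANY) &&
             (info->shared_type != CPUFREQ_SHARED_TYPE_HW) )
            return -EINVAL;
    #endif

        return 0;
    }
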
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/config.h      Tue Nov 04 12:43:19 2008 +0900
@@ -40,14 +40,6 @@
 
 #define CONFIG_HOTPLUG 1
 #define CONFIG_HOTPLUG_CPU 1
-
-/*
- * Avoid deep recursion when tearing down pagetables during domain destruction,
- * causing dom0 to become unresponsive and Xen to miss time-critical softirq
- * deadlines. This will ultimately be replaced by built-in preemptibility of
- * get_page_type().
- */
-#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
 
 #define HZ 100
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/event.h       Tue Nov 04 12:43:19 2008 +0900
@@ -11,36 +11,8 @@
 
 #include <xen/shared.h>
 
-static inline void vcpu_kick(struct vcpu *v)
-{
-    /*
-     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
-     * pending flag. These values may fluctuate (after all, we hold no
-     * locks) but the key insight is that each change will cause
-     * evtchn_upcall_pending to be polled.
-     * 
-     * NB2. We save the running flag across the unblock to avoid a needless
-     * IPI for domains that we IPI'd to unblock.
-     */
-    int running = v->is_running;
-    vcpu_unblock(v);
-    if ( running )
-        smp_send_event_check_cpu(v->processor);
-}
-
-static inline void vcpu_mark_events_pending(struct vcpu *v)
-{
-    int already_pending = test_and_set_bit(
-        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
-
-    if ( already_pending )
-        return;
-
-    if ( is_hvm_vcpu(v) )
-        hvm_assert_evtchn_irq(v);
-    else
-        vcpu_kick(v);
-}
+void vcpu_kick(struct vcpu *v);
+void vcpu_mark_events_pending(struct vcpu *v);
 
 int hvm_local_events_need_delivery(struct vcpu *v);
 static inline int local_events_need_delivery(void)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/fixmap.h
--- a/xen/include/asm-x86/fixmap.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/fixmap.h      Tue Nov 04 12:43:19 2008 +0900
@@ -29,6 +29,7 @@
  * from the end of virtual memory backwards.
  */
 enum fixed_addresses {
+    FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */
 #ifdef __i386__
     FIX_PAE_HIGHMEM_0,
     FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu.h
--- a/xen/include/asm-x86/hvm/vmx/vpmu.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vpmu.h        Tue Nov 04 12:43:19 2008 +0900
@@ -67,7 +67,7 @@ struct vpmu_struct {
 #define VPMU_CONTEXT_ALLOCATED              0x1
 #define VPMU_CONTEXT_LOADED                 0x2
 #define VPMU_RUNNING                        0x4
-
+#define PASSIVE_DOMAIN_ALLOCATED           0x8
 int vpmu_do_wrmsr(struct cpu_user_regs *regs);
 int vpmu_do_rdmsr(struct cpu_user_regs *regs);
 int vpmu_do_interrupt(struct cpu_user_regs *regs);
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu_core2.h
--- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h  Tue Nov 04 12:43:19 2008 +0900
@@ -23,28 +23,6 @@
 #ifndef __ASM_X86_HVM_VPMU_CORE_H_
 #define __ASM_X86_HVM_VPMU_CORE_H_
 
-/* Core 2 Non-architectual Performance Counter MSRs. */
-u32 core2_counters_msr[] =   {
-    MSR_CORE_PERF_FIXED_CTR0,
-    MSR_CORE_PERF_FIXED_CTR1,
-    MSR_CORE_PERF_FIXED_CTR2};
-
-/* Core 2 Non-architectual Performance Control MSRs. */
-u32 core2_ctrls_msr[] = {
-    MSR_CORE_PERF_FIXED_CTR_CTRL,
-    MSR_IA32_PEBS_ENABLE,
-    MSR_IA32_DS_AREA};
-
-struct pmumsr core2_counters = {
-    3,
-    core2_counters_msr
-};
-
-struct pmumsr core2_ctrls = {
-    3,
-    core2_ctrls_msr
-};
-
 struct arch_msr_pair {
     u64 counter;
     u64 control;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vpt.h     Tue Nov 04 12:43:19 2008 +0900
@@ -32,41 +32,6 @@
 #include <asm/hvm/irq.h>
 #include <public/hvm/save.h>
 
-struct HPETState;
-struct HPET_timer_fn_info {
-    struct HPETState *hs;
-    unsigned int tn;
-};
-
-struct hpet_registers {
-    /* Memory-mapped, software visible registers */
-    uint64_t capability;        /* capabilities */
-    uint64_t config;            /* configuration */
-    uint64_t isr;               /* interrupt status reg */
-    uint64_t mc64;              /* main counter */
-    struct {                    /* timers */
-        uint64_t config;        /* configuration/cap */
-        uint64_t cmp;           /* comparator */
-        uint64_t fsb;           /* FSB route, not supported now */
-    } timers[HPET_TIMER_NUM];
-
-    /* Hidden register state */
-    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
-};
-
-typedef struct HPETState {
-    struct hpet_registers hpet;
-    struct vcpu *vcpu;
-    uint64_t stime_freq;
-    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
-    uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns      */
-    uint64_t mc_offset;
-    struct timer timers[HPET_TIMER_NUM];
-    struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
-    spinlock_t lock;
-} HPETState;
-
-
 /*
  * Abstract layer of periodic time, one-shot time.
  */
@@ -107,6 +72,34 @@ typedef struct PITState {
     struct periodic_time pt0;
     spinlock_t lock;
 } PITState;
+
+struct hpet_registers {
+    /* Memory-mapped, software visible registers */
+    uint64_t capability;        /* capabilities */
+    uint64_t config;            /* configuration */
+    uint64_t isr;               /* interrupt status reg */
+    uint64_t mc64;              /* main counter */
+    struct {                    /* timers */
+        uint64_t config;        /* configuration/cap */
+        uint64_t cmp;           /* comparator */
+        uint64_t fsb;           /* FSB route, not supported now */
+    } timers[HPET_TIMER_NUM];
+
+    /* Hidden register state */
+    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
+    uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */
+};
+
+typedef struct HPETState {
+    struct hpet_registers hpet;
+    struct vcpu *vcpu;
+    uint64_t stime_freq;
+    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
+    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns     */
+    uint64_t mc_offset;
+    struct periodic_time pt[HPET_TIMER_NUM];
+    spinlock_t lock;
+} HPETState;
 
 typedef struct RTCState {
     /* Hardware state */
@@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v);
  * The given periodic timer structure must be initialised with zero bytes,
  * except for the 'source' field which must be initialised with the
  * correct PTSRC_ value. The initialised timer structure can then be passed
- * to {create,destroy}_periodic_time() and number of times and in any order.
+ * to {create,destroy}_periodic_time() any number of times and in any order.
  * Note that, for a given periodic timer, invocations of these functions MUST
  * be serialised.
  */
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data);
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data);
 void destroy_periodic_time(struct periodic_time *pt);
 
 int pv_pit_handler(int port, int data, int write);
@@ -185,7 +178,6 @@ void pmtimer_deinit(struct domain *d);
 void pmtimer_deinit(struct domain *d);
 void pmtimer_reset(struct domain *d);
 
-void hpet_migrate_timers(struct vcpu *v);
 void hpet_init(struct vcpu *v);
 void hpet_deinit(struct domain *d);
 void hpet_reset(struct domain *d);
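
The create_periodic_time() prototype swaps the old `char one_shot` flag for an explicit first-expiry delta. Presumably -- this is an inference from the new signature, not stated in the hunk -- a zero period now selects one-shot behaviour:

    /* One-shot: fire once, `delta` ns from now. */
    create_periodic_time(v, pt, delta, 0, irq, cb, data);

    /* Periodic: first expiry after `delta`, then every `period` ns. */
    create_periodic_time(v, pt, delta, period, irq, cb, data);
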
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/mm.h  Tue Nov 04 12:43:19 2008 +0900
@@ -61,12 +61,36 @@ struct page_info
         /*
          * When PGT_partial is true then this field is valid and indicates
          * that PTEs in the range [0, @nr_validated_ptes) have been validated.
-         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
-         * partially validated.
+         * An extra page reference must be acquired (or not dropped) whenever
+         * PGT_partial gets set, and it must be dropped when the flag gets
+         * cleared. This is so that a get() leaving a page in partially
+         * validated state (where the caller would drop the reference acquired
+         * due to the getting of the type [apparently] failing [-EAGAIN])
+         * would not accidentally result in a page left with zero general
+         * reference count, but non-zero type reference count (possible when
+         * the partial get() is followed immediately by domain destruction).
+         * Likewise, the ownership of the single type reference for partially
+         * (in-)validated pages is tied to this flag, i.e. the instance
+         * setting the flag must not drop that reference, whereas the instance
+         * clearing it will have to.
+         *
+         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+         * been partially validated. This implies that the general reference
+         * to the page (acquired from get_page_from_lNe()) would be dropped
+         * (again due to the apparent failure) and hence must be re-acquired
+         * when resuming the validation, but must not be dropped when picking
+         * up the page for invalidation.
+         *
+         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+         * been partially invalidated. This is basically the opposite case of
+         * above, i.e. the general reference to the page was not dropped in
+         * put_page_from_lNe() (due to the apparent failure), and hence it
+         * must be dropped when the put operation is resumed (and completes),
+         * but it must not be acquired if picking up the page for validation.
          */
         struct {
             u16 nr_validated_ptes;
-            bool_t partial_pte;
+            s8 partial_pte;
         };
 
         /*
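
The widened page_info comment is dense; the tri-state it describes boils down to (a restatement for quick reference, not code from the patch):

    s8 partial_pte;
    /*  > 0 : PTE at nr_validated_ptes+1 partially validated; the general
     *        page reference was dropped, so re-acquire it when resuming
     *        validation, but do not drop it again when invalidating.
     *  < 0 : PTE at nr_validated_ptes+1 partially invalidated; the general
     *        reference was kept, so drop it when the put completes, but do
     *        not take another when switching over to validation.
     *  == 0: no partially handled PTE at that slot. */
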
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/page.h        Tue Nov 04 12:43:19 2008 +0900
@@ -314,6 +314,9 @@ unsigned long clone_idle_pagetable(struc
 #define __PAGE_HYPERVISOR_NOCACHE \
     (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
 
+#define GRANT_PTE_FLAGS \
+    (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB)
+
 #ifndef __ASSEMBLY__
 
 static inline int get_order_from_bytes(paddr_t size)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/softirq.h
--- a/xen/include/asm-x86/softirq.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/softirq.h     Tue Nov 04 12:43:19 2008 +0900
@@ -3,7 +3,8 @@
 
 #define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
 #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
+#define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
 
-#define NR_ARCH_SOFTIRQS       2
+#define NR_ARCH_SOFTIRQS       3
 
 #endif /* __ASM_SOFTIRQ_H__ */
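
The new VCPU_KICK_SOFTIRQ pairs with the out-of-lining of vcpu_kick()/vcpu_mark_events_pending() in asm-x86/event.h above. The bodies live in arch code not shown in this excerpt; a rough sketch of the presumable mechanism, with the caveat that this is a guess at the shape rather than the actual implementation:

    void vcpu_kick(struct vcpu *v)
    {
        /* Unblock first, then poke the target CPU via a softirq rather
         * than an immediate smp_send_event_check_cpu(), so the kick is
         * also safe from IRQ context. */
        int running = v->is_running;

        vcpu_unblock(v);
        if ( running && (v != current) )
            cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
    }
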
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_32/page.h
--- a/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:43:19 2008 +0900
@@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF))
 
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB)
-
 /*
  * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL.
  * Permit the NX bit if the hardware supports it.
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:43:19 2008 +0900
@@ -119,13 +119,10 @@ typedef l4_pgentry_t root_pgentry_t;
 #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
 #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
 
-#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU
+#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
-
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER)
 
 #define USER_MAPPINGS_ARE_GLOBAL
 #ifdef USER_MAPPINGS_ARE_GLOBAL
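
The COMPAT_L3_DISALLOW_MASK relaxation is easier to read as bit arithmetic (flag values are the standard x86 ones; the reading of intent is ours):

    old disallow mask: 0xFFFFF1FE
    new disallow mask: 0xFFFFF198
    newly permitted  : 0x1FE & ~0x198 = 0x066
                     = _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY
                       (0x002     0x004        0x020            0x040)

i.e. compat-mode L3 entries may now carry the RW/USER/ACCESSED/DIRTY bits without failing validation.
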
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/xenoprof.h
--- a/xen/include/asm-x86/xenoprof.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/xenoprof.h    Tue Nov 04 12:43:19 2008 +0900
@@ -64,6 +64,9 @@ void xenoprof_backtrace(
                  "xenoprof/x86 with autotranslated mode enabled"    \
                  "isn't supported yet\n");                          \
     } while (0)
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs);
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs);
+void passive_domain_destroy(struct vcpu *v);
 
 #endif /* __ASM_X86_XENOPROF_H__ */
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/features.h
--- a/xen/include/public/features.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/features.h     Tue Nov 04 12:43:19 2008 +0900
@@ -59,6 +59,9 @@
 /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
 #define XENFEAT_mmu_pt_update_preserve_ad  5
 
+/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
+#define XENFEAT_highmem_assist             6
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/trace.h
--- a/xen/include/public/trace.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/trace.h        Tue Nov 04 12:43:19 2008 +0900
@@ -142,7 +142,9 @@
 #define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
 #define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
 #define TRC_HVM_IO_ASSIST       (TRC_HVM_HANDLER + 0x16)
+#define TRC_HVM_IO_ASSIST64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16)
 #define TRC_HVM_MMIO_ASSIST     (TRC_HVM_HANDLER + 0x17)
+#define TRC_HVM_MMIO_ASSIST64   (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17)
 #define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
 #define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
 #define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/xen.h
--- a/xen/include/public/xen.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/xen.h  Tue Nov 04 12:43:19 2008 +0900
@@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
  */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
@@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
     unsigned int cmd;
     union {
-        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+         * CLEAR_PAGE, COPY_PAGE */
         xen_pfn_t     mfn;
         /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
         unsigned long linear_addr;
@@ -266,6 +276,8 @@ struct mmuext_op {
 #else
         void *vcpumask;
 #endif
+        /* COPY_PAGE */
+        xen_pfn_t src_mfn;
     } arg2;
 };
 typedef struct mmuext_op mmuext_op_t;
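
From the guest side, the two new MMUEXT commands slot into the existing hypercall like so (an illustrative sketch: dst_mfn/src_mfn are placeholders, and the caller must own both frames):

    struct mmuext_op op = {
        .cmd  = MMUEXT_COPY_PAGE,
        .arg1 = { .mfn = dst_mfn },
        .arg2 = { .src_mfn = src_mfn },
    };

    if ( HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) != 0 )
        /* e.g. fall back to mapping both frames and memcpy()ing */;

Per XENFEAT_highmem_assist above, a guest should probe the feature map before relying on these commands.
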
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/cpuidle.h
--- a/xen/include/xen/cpuidle.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/cpuidle.h Tue Nov 04 12:43:19 2008 +0900
@@ -30,12 +30,18 @@
 #define ACPI_PROCESSOR_MAX_POWER        8
 #define CPUIDLE_NAME_LEN                16
 
+#define ACPI_CSTATE_EM_NONE     0
+#define ACPI_CSTATE_EM_SYSIO    1
+#define ACPI_CSTATE_EM_FFH      2
+#define ACPI_CSTATE_EM_HALT     3
+
 struct acpi_processor_cx
 {
+    u8 idx;
     u8 valid;
     u8 type;
     u32 address;
-    u8 space_id;
+    u8 entry_method; /* ACPI_CSTATE_EM_xxx */
     u32 latency;
     u32 latency_ticks;
     u32 power;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/domain_page.h     Tue Nov 04 12:43:19 2008 +0900
@@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn)
  * Pass a VA within a page previously mapped in the context of the
  * currently-executing VCPU via a call to map_domain_page().
  */
-void unmap_domain_page(void *va);
+void unmap_domain_page(const void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -32,7 +32,7 @@ void unmap_domain_page(void *va);
  * mappings can also be unmapped from any context.
  */
 void *map_domain_page_global(unsigned long mfn);
-void unmap_domain_page_global(void *va);
+void unmap_domain_page_global(const void *va);
 
 #define DMCACHE_ENTRY_VALID 1U
 #define DMCACHE_ENTRY_HELD  2U
@@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long
 }
 
 static inline void
-unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache)
+unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache)
 {
     ASSERT(cache != NULL);
     cache->flags &= ~DMCACHE_ENTRY_HELD;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/spinlock.h
--- a/xen/include/xen/spinlock.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/spinlock.h        Tue Nov 04 12:43:19 2008 +0900
@@ -5,21 +5,38 @@
 #include <asm/system.h>
 #include <asm/spinlock.h>
 
+#ifndef NDEBUG
+struct lock_debug {
+    int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
+};
+#define _LOCK_DEBUG { -1 }
+void spin_debug_enable(void);
+void spin_debug_disable(void);
+#else
+struct lock_debug { };
+#define _LOCK_DEBUG { }
+#define spin_debug_enable() ((void)0)
+#define spin_debug_disable() ((void)0)
+#endif
+
 typedef struct {
     raw_spinlock_t raw;
     u16 recurse_cpu:12;
     u16 recurse_cnt:4;
+    struct lock_debug debug;
 } spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0 }
+
+#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
 #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
 #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
 
 typedef struct {
     raw_rwlock_t raw;
+    struct lock_debug debug;
 } rwlock_t;
 
-#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED }
+#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
 #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
 #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)
 
@@ -34,6 +51,7 @@ int _spin_is_locked(spinlock_t *lock);
 int _spin_is_locked(spinlock_t *lock);
 int _spin_trylock(spinlock_t *lock);
 void _spin_barrier(spinlock_t *lock);
+void _spin_barrier_irq(spinlock_t *lock);
 
 void _spin_lock_recursive(spinlock_t *lock);
 void _spin_unlock_recursive(spinlock_t *lock);
@@ -67,6 +85,7 @@ void _write_unlock_irqrestore(rwlock_t *
 
 /* Ensure a lock is quiescent between two critical operations. */
 #define spin_barrier(l)               _spin_barrier(l)
+#define spin_barrier_irq(l)           _spin_barrier_irq(l)
 
 /*
  * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
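
The new lock_debug field enables a cheap consistency check: the first acquisition latches whether the lock is taken with IRQs disabled, and every later acquisition must match. The checking code is not part of this hunk; one plausible shape (hypothetical helper):

    static void check_lock(struct lock_debug *debug)
    {
        int irq_safe = !local_irq_is_enabled();

        if ( unlikely(debug->irq_safe != irq_safe) )
        {
            int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);

            /* Latched one way on first use, then used the other way. */
            BUG_ON(seen == !irq_safe);
        }
    }
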
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/time.h
--- a/xen/include/xen/time.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/time.h    Tue Nov 04 12:43:19 2008 +0900
@@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t);
 #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
 #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
+#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
 
 extern void update_vcpu_system_time(struct vcpu *v);
 extern void update_domain_wallclock_time(struct domain *d);
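
STIME_MAX is simply the largest positive s_time_t, i.e. INT64_MAX nanoseconds. As a sanity check on the range that buys:

    2^63 - 1 ns  ~=  9.22e18 ns  ~=  9.22e9 s  ~=  292 years

so saturating timer arithmetic at STIME_MAX is safe for any realistic uptime.
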
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/timer.h
--- a/xen/include/xen/timer.h   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/timer.h   Tue Nov 04 12:43:19 2008 +0900
@@ -15,12 +15,13 @@ struct timer {
 struct timer {
     /* System time expiry value (nanoseconds since boot). */
     s_time_t expires;
+    s_time_t expires_end;
 
     /* Position in active-timer data structure. */
     union {
         /* Timer-heap offset. */
         unsigned int heap_offset;
-        /* Overflow linked list. */
+        /* Linked list. */
         struct timer *list_next;
     };
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xlat.lst
--- a/xen/include/xlat.lst      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xlat.lst      Tue Nov 04 12:43:19 2008 +0900
@@ -56,6 +56,6 @@
 !      processor_flags                 platform.h
 !      processor_power                 platform.h
 !      pct_register                    platform.h
-!      processor_px                    platform.h
+?      processor_px                    platform.h
 !      psd_package                     platform.h
 !      processor_performance           platform.h

