
xen-changelog

[Xen-changelog] merge

# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 673f62edbfbe4098ea1d5a34d8a77667da762090
# Parent  88f97bb8f3ae7e0fb85dbe8fb420d7f02f844a34
# Parent  d8451bb6278cb5f3f477dd9392213be7c66730b4
merge

diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_32
--- a/buildconfigs/linux-defconfig_xen0_x86_32  Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_32  Wed Mar  1 19:47:25 2006
@@ -1320,6 +1320,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_64
--- a/buildconfigs/linux-defconfig_xen0_x86_64  Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_64  Wed Mar  1 19:47:25 2006
@@ -1244,6 +1244,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_32
--- a/buildconfigs/linux-defconfig_xen_x86_32   Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_32   Wed Mar  1 19:47:25 2006
@@ -2986,6 +2986,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_64
--- a/buildconfigs/linux-defconfig_xen_x86_64   Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_64   Wed Mar  1 19:47:25 2006
@@ -2656,6 +2656,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/mk.linux-2.6-xen
--- a/buildconfigs/mk.linux-2.6-xen     Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/mk.linux-2.6-xen     Wed Mar  1 19:47:25 2006
@@ -2,8 +2,8 @@
 OS           = linux
 
 LINUX_SERIES = 2.6
-LINUX_VER    = 2.6.16-rc4
-LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc4.bz2
+LINUX_VER    = 2.6.16-rc5
+LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc5.bz2
 LINUX_PDIR = linux-$(LINUX_VER)
 
 EXTRAVERSION ?= xen
@@ -34,7 +34,7 @@
        touch $(@D)/.hgskip
        touch $@
 
-pristine-linux-%.16-rc4/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
+pristine-linux-%.16-rc5/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
        touch $@ # update timestamp to avoid rebuild
 
 $(LINUX_DIR)/include/linux/autoconf.h: ref-$(OS)-$(LINUX_VER)/.valid-ref
diff -r 88f97bb8f3ae -r 673f62edbfbe docs/src/user.tex
--- a/docs/src/user.tex Wed Mar  1 17:01:54 2006
+++ b/docs/src/user.tex Wed Mar  1 19:47:25 2006
@@ -626,7 +626,7 @@
 allow you to monitor and log the Xen boot process via serial console and
 can be very useful in debugging.
 
-%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1
+%% kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
 %% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro
 
 In order to configure Xen serial console output, it is necessary to
@@ -637,8 +637,9 @@
 \end{verbatim}}
 \end{quote}
 
-This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1
-stop bit and no parity. Modify these parameters for your environment.
+This configures Xen to output on COM1 at 115,200 baud, 8 data bits, no
+parity and 1 stop bit. Modify these parameters for your environment.
+See Section~\ref{s:xboot} for an explanation of all boot parameters.
 
 One can also configure XenLinux to share the serial console; to achieve
 this append ``\path{console=ttyS0}'' to your module line.
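
Putting the two documentation hunks above together, a complete GRUB entry for serial-console logging would look roughly like this (kernel paths, dom0 memory size and root device are illustrative, following the user.tex example):

    title Xen (serial console)
            kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
            module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=ttyS0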
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/Kconfig
--- a/linux-2.6-xen-sparse/arch/i386/Kconfig    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/Kconfig    Wed Mar  1 19:47:25 2006
@@ -770,7 +770,7 @@
 
 config HOTPLUG_CPU
        bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
-       depends on SMP && HOTPLUG && EXPERIMENTAL
+       depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
        ---help---
          Say Y here to experiment with turning CPUs off and on.  CPUs
          can be controlled through /sys/devices/system/cpu.
@@ -1122,6 +1122,7 @@
 
 config KPROBES
        bool "Kprobes (EXPERIMENTAL)"
+       depends on EXPERIMENTAL && MODULES
        help
          Kprobes allows you to trap at almost any kernel address and
          execute a callback function.  register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile    Wed Mar  1 19:47:25 2006
@@ -7,7 +7,7 @@
 obj-y  := process.o semaphore.o signal.o entry.o traps.o irq.o \
                ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
                pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
-               quirks.o i8237.o
+               quirks.o i8237.o topology.o
 
 obj-y                          += cpu/
 obj-y                          += timers/
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c     Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c     Wed Mar  1 19:47:25 2006
@@ -44,9 +44,6 @@
 extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
-
-
 #else                          /* X86 */
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c    Wed Mar  1 19:47:25 2006
@@ -4,6 +4,7 @@
 #include <linux/smp.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/bootmem.h>
 #include <asm/semaphore.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
@@ -18,6 +19,9 @@
 #include <asm/hypervisor.h>
 
 #include "cpu.h"
+
+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
 #ifndef CONFIG_XEN
 DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
@@ -598,6 +602,8 @@
        struct tss_struct * t = &per_cpu(init_tss, cpu);
 #endif
        struct thread_struct *thread = &current->thread;
+       struct desc_struct *gdt;
+       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        if (cpu_test_and_set(cpu, cpu_initialized)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -614,7 +620,54 @@
                set_in_cr4(X86_CR4_TSD);
        }
 
-       cpu_gdt_init(&cpu_gdt_descr[cpu]);
+#ifndef CONFIG_XEN
+       /*
+        * This is a horrible hack to allocate the GDT.  The problem
+        * is that cpu_init() is called really early for the boot CPU
+        * (and hence needs bootmem) but much later for the secondary
+        * CPUs, when bootmem will have gone away
+        */
+       if (NODE_DATA(0)->bdata->node_bootmem_map) {
+               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+               /* alloc_bootmem_pages panics on failure, so no check */
+               memset(gdt, 0, PAGE_SIZE);
+       } else {
+               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+               if (unlikely(!gdt)) {
+                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+                       for (;;)
+                               local_irq_enable();
+               }
+       }
+
+       /*
+        * Initialize the per-CPU GDT with the boot GDT,
+        * and set up the GDT descriptor:
+        */
+       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+
+       /* Set up GDT entry for 16bit stack */
+       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
+               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+               (CPU_16BIT_STACK_SIZE - 1);
+
+       cpu_gdt_descr->size = GDT_SIZE - 1;
+       cpu_gdt_descr->address = (unsigned long)gdt;
+#else
+       if (cpu == 0 && cpu_gdt_descr->address == 0) {
+               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+               /* alloc_bootmem_pages panics on failure, so no check */
+               memset(gdt, 0, PAGE_SIZE);
+
+               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+               
+               cpu_gdt_descr->size = GDT_SIZE;
+               cpu_gdt_descr->address = (unsigned long)gdt;
+       }
+#endif
+
+       cpu_gdt_init(cpu_gdt_descr);
 
        /*
         * Set up and load the per-CPU TSS and LDT
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S
--- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S  Wed Mar  1 19:47:25 2006
@@ -87,19 +87,9 @@
  */
 .data
 
-       ALIGN
-       .word 0                         # 32 bit align gdt_desc.address
-       .globl cpu_gdt_descr
-cpu_gdt_descr:
-       .word GDT_SIZE
-       .long cpu_gdt_table
-
-       .fill NR_CPUS-1,8,0             # space for the other GDT descriptors
-
 /*
  * The Global Descriptor Table contains 28 quadwords, per-CPU.
  */
-       .align PAGE_SIZE_asm
 ENTRY(cpu_gdt_table)
        .quad 0x0000000000000000        /* NULL descriptor */
        .quad 0x0000000000000000        /* 0x0b reserved */
@@ -148,10 +138,6 @@
        .quad 0x0000000000000000        /* 0xf0 - unused */
 	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
 
-       /* Be sure this is zeroed to avoid false validations in Xen */
-       .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
-
-
 /*
  * __xen_guest information
  */
@@ -176,6 +162,7 @@
        .ascii  ",FEATURES=writable_page_tables"
        .ascii           "|writable_descriptor_tables"
        .ascii           "|auto_translated_physmap"
+       .ascii           "|pae_pgdir_above_4gb"
        .ascii           "|supervisor_mode_kernel"
 #ifdef CONFIG_X86_PAE
        .ascii  ",PAE=yes"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c       Wed Mar  1 19:47:25 2006
@@ -2634,8 +2634,10 @@
                spin_unlock_irqrestore(&ioapic_lock, flags);
 
                /* Sanity check */
-               if (reg_00.bits.ID != apic_id)
-                       panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+               if (reg_00.bits.ID != apic_id) {
+                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+                       return -1;
+               }
        }
 
        apic_printk(APIC_VERBOSE, KERN_INFO
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c       Wed Mar  1 19:47:25 2006
@@ -935,6 +935,7 @@
        u32                     gsi_base)
 {
        int                     idx = 0;
+       int                     tmpid;
 
        if (nr_ioapics >= MAX_IO_APICS) {
                printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -957,9 +958,14 @@
        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 #endif
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
-               mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+               tmpid = io_apic_get_unique_id(idx, id);
        else
-               mp_ioapics[idx].mpc_apicid = id;
+               tmpid = id;
+       if (tmpid == -1) {
+               nr_ioapics--;
+               return;
+       }
+       mp_ioapics[idx].mpc_apicid = tmpid;
        mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
        
        /* 
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c   Wed Mar  1 19:47:25 2006
@@ -898,12 +898,6 @@
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
 
-       if (!cpu_gdt_descr[cpu].address &&
-           !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk("Failed to allocate GDT for CPU %d\n", cpu);
-               return 1;
-       }
-
        ++cpucount;
 
        /*
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c  Wed Mar  1 19:47:25 2006
@@ -48,6 +48,8 @@
 #include <linux/mca.h>
 #include <linux/sysctl.h>
 #include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -70,6 +72,7 @@
 #include <asm/arch_hooks.h>
 
 #include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
 
 #if defined (__i386__)
 #include <asm/i8259.h>
@@ -122,6 +125,13 @@
 /* Keep track of last time we did processing/updating of jiffies and xtime. */
 static u64 processed_system_time;   /* System time (ns) at last processing. */
 static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
@@ -477,14 +487,45 @@
 
 EXPORT_SYMBOL(do_settimeofday);
 
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static void sync_xen_wallclock(unsigned long dummy);
+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+static void sync_xen_wallclock(unsigned long dummy)
+{
+       time_t sec;
+       s64 nsec;
+       dom0_op_t op;
+
+       if (!ntp_synced() || independent_wallclock ||
+           !(xen_start_info->flags & SIF_INITDOMAIN))
+               return;
+
+       write_seqlock_irq(&xtime_lock);
+
+       sec  = xtime.tv_sec;
+       nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
+       __normalize_time(&sec, &nsec);
+
+       op.cmd = DOM0_SETTIME;
+       op.u.settime.secs        = sec;
+       op.u.settime.nsecs       = nsec;
+       op.u.settime.system_time = processed_system_time;
+       HYPERVISOR_dom0_op(&op);
+
+       update_wallclock();
+
+       write_sequnlock_irq(&xtime_lock);
+
+       /* Once per minute. */
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+}
+
 static int set_rtc_mmss(unsigned long nowtime)
 {
        int retval;
 
        WARN_ON(irqs_disabled());
 
-       if (!(xen_start_info->flags & SIF_INITDOMAIN))
+       if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
                return 0;
 
        /* gets recalled with irq locally disabled */
@@ -497,12 +538,6 @@
 
        return retval;
 }
-#else
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       return 0;
-}
-#endif
 
 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
  *             Note: This function is required to return accurate
@@ -567,19 +602,37 @@
 
 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-       s64 delta, delta_cpu;
+       s64 delta, delta_cpu, stolen, blocked;
+       u64 sched_time;
        int i, cpu = smp_processor_id();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
 
        write_seqlock(&xtime_lock);
 
        do {
                get_time_values_from_xen();
 
+               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu = 
                        shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);
+
+               /*
+                * Obtain a consistent snapshot of stolen/blocked cycles. We
+                * can use state_entry_time to detect if we get preempted here.
+                */
+               do {
+                       sched_time = runstate->state_entry_time;
+                       barrier();
+                       stolen = runstate->time[RUNSTATE_runnable] +
+                               runstate->time[RUNSTATE_offline] -
+                               per_cpu(processed_stolen_time, cpu);
+                       blocked = runstate->time[RUNSTATE_blocked] -
+                               per_cpu(processed_blocked_time, cpu);
+                       barrier();
+               } while (sched_time != runstate->state_entry_time);
        }
        while (!time_values_up_to_date(cpu));
 
@@ -612,18 +665,67 @@
        write_sequnlock(&xtime_lock);
 
        /*
-         * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure
-         * if there is risk of deadlock if we do (since update_process_times
-         * may do scheduler rebalancing work and thus acquire runqueue locks).
-         */
-       while (delta_cpu >= NS_PER_TICK) {
-               delta_cpu -= NS_PER_TICK;
-               per_cpu(processed_system_time, cpu) += NS_PER_TICK;
-               update_process_times(user_mode(regs));
-               profile_tick(CPU_PROFILING, regs);
-       }
+        * Account stolen ticks.
+        * HACK: Passing NULL to account_steal_time()
+        * ensures that the ticks are accounted as stolen.
+        */
+       if (stolen > 0) {
+               delta_cpu -= stolen;
+               do_div(stolen, NS_PER_TICK);
+               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+               account_steal_time(NULL, (cputime_t)stolen);
+       }
+
+       /*
+        * Account blocked ticks.
+        * HACK: Passing idle_task to account_steal_time()
+        * ensures that the ticks are accounted as idle/wait.
+        */
+       if (blocked > 0) {
+               delta_cpu -= blocked;
+               do_div(blocked, NS_PER_TICK);
+               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
+               account_steal_time(idle_task(cpu), (cputime_t)blocked);
+       }
+
+       /* Account user/system ticks. */
+       if (delta_cpu > 0) {
+               do_div(delta_cpu, NS_PER_TICK);
+               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+               if (user_mode(regs))
+                       account_user_time(current, (cputime_t)delta_cpu);
+               else
+                       account_system_time(current, HARDIRQ_OFFSET,
+                                           (cputime_t)delta_cpu);
+       }
+
+       /* Local timer processing (see update_process_times()). */
+       run_local_timers();
+       if (rcu_pending(cpu))
+               rcu_check_callbacks(cpu, user_mode(regs));
+       scheduler_tick();
+       run_posix_cpu_timers(current);
 
        return IRQ_HANDLED;
+}
+
+static void init_missing_ticks_accounting(int cpu)
+{
+       struct vcpu_register_runstate_memory_area area;
+       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+       memset(runstate, 0, sizeof(*runstate));
+
+       area.addr.v = runstate;
+       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+       per_cpu(processed_blocked_time, cpu) =
+               runstate->time[RUNSTATE_blocked];
+       per_cpu(processed_stolen_time, cpu) =
+               runstate->time[RUNSTATE_runnable] +
+               runstate->time[RUNSTATE_offline];
 }
 
 /* not static: needed by APM */
@@ -691,6 +793,7 @@
 void notify_arch_cmos_timer(void)
 {
        mod_timer(&sync_cmos_timer, jiffies + 1);
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
 }
 
 static long clock_cmos_diff, sleep_start;
@@ -814,6 +917,7 @@
 
        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
+       init_missing_ticks_accounting(0);
 
        update_wallclock();
 
@@ -891,6 +995,7 @@
 
        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
+       init_missing_ticks_accounting(0);
 
        update_wallclock();
 }
@@ -909,6 +1014,7 @@
                /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
                per_cpu(processed_system_time, cpu) = 
                        per_cpu(shadow_time, 0).system_timestamp;
+               init_missing_ticks_accounting(cpu);
        } while (read_seqretry(&xtime_lock, seq));
 
        sprintf(timer_name[cpu], "timer%d", cpu);
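
The stolen/blocked-time accounting added above depends on taking a consistent, lock-free snapshot of the vcpu_runstate_info area that the hypervisor updates behind the guest's back: state_entry_time is read before and after the time[] fields, and the read is retried if it changed in between. A minimal sketch of just that pattern, using the same field names as the hunk (the surrounding xtime_lock section is omitted):

    u64 sched_time;
    s64 stolen;

    do {
            /* timestamp of the hypervisor's last update to the area */
            sched_time = runstate->state_entry_time;
            barrier();
            stolen = runstate->time[RUNSTATE_runnable] +
                     runstate->time[RUNSTATE_offline];
            barrier();
            /* if the timestamp moved, the hypervisor raced with us: retry */
    } while (sched_time != runstate->state_entry_time);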
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile  Wed Mar  1 19:47:25 2006
@@ -2,6 +2,4 @@
 # Makefile for the linux kernel.
 #
 
-obj-y                          := setup.o topology.o
-  
-topology-y                     := ../mach-default/topology.o
+obj-y                          := setup.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Wed Mar  1 19:47:25 2006
@@ -454,6 +454,7 @@
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL(__supported_pte_mask);
 
 /*
  * noexec = on|off
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/Kconfig
--- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig  Wed Mar  1 19:47:25 2006
@@ -381,21 +381,6 @@
          as it is off-chip.  You can find the HPET spec at
          <http://www.intel.com/hardwaredesign/hpetspec.htm>.
 
-config X86_PM_TIMER
-       bool "PM timer" if EMBEDDED
-       depends on ACPI && !X86_64_XEN
-       default y
-       help
-         Support the ACPI PM timer for time keeping. This is slow,
-         but is useful on some chipsets without HPET on systems with more
-         than one CPU. On a single processor or single socket multi core
-         system it is normally not required.
-         When the PM timer is active 64bit vsyscalls are disabled
-         and should not be enabled (/proc/sys/kernel/vsyscall64 should
-         not be changed).
-         The kernel selects the PM timer only as a last resort, so it is
-         useful to enable just in case.
-
 config HPET_EMULATE_RTC
        bool "Provide RTC interrupt"
        depends on HPET_TIMER && RTC=y
@@ -640,6 +625,7 @@
 
 config KPROBES
        bool "Kprobes (EXPERIMENTAL)"
+       depends on EXPERIMENTAL && MODULES
        help
          Kprobes allows you to trap at almost any kernel address and
          execute a callback function.  register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile  Wed Mar  1 19:47:25 2006
@@ -45,7 +45,7 @@
 
 bootflag-y                     += ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
-topology-y                     += ../../i386/mach-default/topology.o
+topology-y                     += ../../i386/kernel/topology.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
 intel_cacheinfo-y              += ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y                       += ../../i386/kernel/quirks.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c        Wed Mar  1 19:47:25 2006
@@ -114,6 +114,8 @@
        irq_exit();
 }
 
+int __initdata unsync_tsc_on_multicluster;
+
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S       Wed Mar  1 19:47:25 2006
@@ -51,6 +51,7 @@
 #include <asm/page.h>
 #include <asm/errno.h>
 #include <xen/interface/arch-x86_64.h>
+#include <xen/interface/features.h>
 
 #include "irq_vectors.h"
 
@@ -146,16 +147,19 @@
          */
        .macro HYPERVISOR_IRET flag
        testb $3,1*8(%rsp)
-       jnz   1f
+       jnz   2f
        testl $NMI_MASK,2*8(%rsp)
+       jnz   2f
+
+       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
        jnz   1f
 
        /* Direct iret to kernel space. Correct CS and SS. */
        orb   $3,1*8(%rsp)
        orb   $3,4*8(%rsp)
-       iretq
-
-1:     /* Slow iret via hypervisor. */
+1:     iretq
+
+2:     /* Slow iret via hypervisor. */
        andl  $~NMI_MASK, 16(%rsp)
        pushq $\flag
        jmp  hypercall_page + (__HYPERVISOR_iret * 32)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c     Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c     Wed Mar  1 19:47:25 2006
@@ -51,6 +51,8 @@
 int disable_timer_pin_1 __initdata;
 
 #ifndef CONFIG_XEN
+int timer_over_8254 __initdata = 1;
+
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 #endif
@@ -300,6 +302,22 @@
 
 __setup("noapic", disable_ioapic_setup);
 __setup("apic", enable_ioapic_setup);
+
+#ifndef CONFIG_XEN
+static int __init setup_disable_8254_timer(char *s)
+{
+       timer_over_8254 = -1;
+       return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+       timer_over_8254 = 2;
+       return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+#endif /* !CONFIG_XEN */
 
 #include <asm/pci-direct.h>
 #include <linux/pci_ids.h>
@@ -360,27 +378,20 @@
                                        /* RED-PEN skip them on mptables too? */
                                        return;
                                case PCI_VENDOR_ID_ATI:
+
+                               /* This should be actually default, but
+                                  for 2.6.16 let's do it for ATI only where
+                                  it's really needed. */
 #ifndef CONFIG_XEN
-                                       if (apic_runs_main_timer != 0)
-                                               break;
-#ifdef CONFIG_ACPI
-                                       /* Don't do this for laptops right
-                                          right now because their timer
-                                          doesn't necessarily tick in C2/3 */
-                                       if (acpi_fadt.revision >= 3 &&
-                       (acpi_fadt.plvl2_lat + acpi_fadt.plvl3_lat) < 1100) {
-                                               printk(KERN_INFO
-"ATI board detected, but seems to be a laptop. Timer might be shakey, 
sorry\n");
-                                               break;
-                                       }
-#endif                                 
+                                       if (timer_over_8254 == 1) {     
+                                               timer_over_8254 = 0;    
                                        printk(KERN_INFO
-            "ATI board detected. Using APIC/PM timer.\n");
-                                       apic_runs_main_timer = 1;
-                                       nohpet = 1;
+               "ATI board detected. Disabling timer routing over 8254.\n");
+                                       }       
 #endif
                                        return;
                                } 
+
 
                                /* No multi-function device? */
                                type = read_pci_config_byte(num,slot,func,
@@ -1848,6 +1859,8 @@
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
  */
 static inline void check_timer(void)
 {
@@ -1870,7 +1883,8 @@
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
-       enable_8259A_irq(0);
+       if (timer_over_8254 > 0)
+               enable_8259A_irq(0);
 
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1925,7 +1939,7 @@
        }
        printk(" failed.\n");
 
-       if (nmi_watchdog) {
+       if (nmi_watchdog == NMI_IO_APIC) {
                printk(KERN_WARNING "timer doesn't work through the IO-APIC - 
disabling NMI Watchdog!\n");
                nmi_watchdog = 0;
        }
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Wed Mar  1 19:47:25 2006
@@ -462,6 +462,12 @@
                else if(!memcmp(from, "elfcorehdr=", 11))
                        elfcorehdr_addr = memparse(from+11, &from);
 #endif
+
+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
+               else if (!memcmp(from, "additional_cpus=", 16))
+                       setup_additional_cpus(from+16);
+#endif
+
        next_char:
                c = *(from++);
                if (!c)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/acpi/Kconfig
--- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar  1 19:47:25 2006
@@ -247,7 +247,7 @@
           Enter the full path name to the file wich includes the AmlCode declaration.
 
 config ACPI_BLACKLIST_YEAR
-       int "Disable ACPI for systems before Jan 1st this year" if X86
+       int "Disable ACPI for systems before Jan 1st this year" if X86_32
        default 0
        help
          enter a 4-digit year, eg. 2001 to disable ACPI by default
@@ -285,9 +285,9 @@
          dump your ACPI DSDT table using /proc/acpi/dsdt.
 
 config X86_PM_TIMER
-       bool "Power Management Timer Support"
-       depends on X86
-       depends on !X86_64
+       bool "Power Management Timer Support" if EMBEDDED
+       depends on X86
+       depends on !XEN
        default y
        help
          The Power Management Timer is available on all ACPI-capable,
@@ -298,9 +298,8 @@
          voltage scaling, unlike the commonly used Time Stamp Counter
          (TSC) timing source.
 
-         So, if you see messages like 'Losing too many ticks!' in the
-         kernel logs, and/or you are using this on a notebook which
-         does not yet have an HPET, you should say "Y" here.
+         You should nearly always say Y here because many modern
+         systems require this timer. 
 
 config ACPI_CONTAINER
        tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/video/Kconfig
--- a/linux-2.6-xen-sparse/drivers/video/Kconfig        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/video/Kconfig        Wed Mar  1 19:47:25 2006
@@ -520,7 +520,7 @@
 config FB_GBE_MEM
        int "Video memory size in MB"
        depends on FB_GBE
-       default 8
+       default 4
        help
          This is the amount of memory reserved for the framebuffer,
          which can be any value between 1MB and 8MB.
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/Kconfig
--- a/linux-2.6-xen-sparse/drivers/xen/Kconfig  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig  Wed Mar  1 19:47:25 2006
@@ -68,7 +68,7 @@
        default n
 
 config XEN_BLKDEV_BACKEND
-       bool "Block-device backend driver"
+       tristate "Block-device backend driver"
        default y
        help
          The block-device backend driver allows the kernel to export its
@@ -76,7 +76,7 @@
          interface.
 
 config XEN_BLKDEV_TAP_BE
-        bool "Block Tap support for backend driver (DANGEROUS)"
+        tristate "Block Tap support for backend driver (DANGEROUS)"
         depends on XEN_BLKDEV_BACKEND
         default n
         help
@@ -89,7 +89,7 @@
           modified to use grant tables.
 
 config XEN_NETDEV_BACKEND
-       bool "Network-device backend driver"
+       tristate "Network-device backend driver"
        default y
        help
          The network-device backend driver allows the kernel to export its
@@ -109,8 +109,16 @@
          are unsure; or if you experience network hangs when this option is
          enabled; then you must say N here.
 
+config XEN_NETDEV_LOOPBACK
+       tristate "Network-device loopback driver"
+       depends on XEN_NETDEV_BACKEND
+       default y
+       help
+         A two-interface loopback device to emulate a local netfront-netback
+         connection.
+
 config XEN_TPMDEV_BACKEND
-       bool "TPM-device backend driver"
+       tristate "TPM-device backend driver"
        default n
        help
          The TPM-device backend driver
@@ -145,7 +153,7 @@
          (domain 0), then you almost certainly want to say Y here.
 
 config XEN_BLKDEV_TAP
-       bool "Block device tap driver"
+       tristate "Block device tap driver"
        default n
        help
          This driver allows a VM to interact on block device channels
@@ -154,7 +162,7 @@
          space.  Odds are that you want to say N here.
 
 config XEN_TPMDEV_FRONTEND
-       bool "TPM-device frontend driver"
+       tristate "TPM-device frontend driver"
        default n
        select TCG_TPM
        select TCG_XEN
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar  1 19:47:25 2006
@@ -1,2 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
 
-obj-y  := blkback.o xenbus.o interface.o vbd.o
+blkbk-y        := blkback.o xenbus.o interface.o vbd.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Wed Mar  1 19:47:25 2006
@@ -29,14 +29,10 @@
  * 64 should be enough to keep us competitive with Linux.
  */
 static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
 static int mmap_pages;
-
-static int __init set_blkif_reqs(char *str)
-{
-       get_option(&str, &blkif_reqs);
-       return 1;
-}
-__setup("blkif_reqs=", set_blkif_reqs);
 
 /* Run-time switchable: /sys/module/blkback/parameters/ */
 static unsigned int log_stats = 0;
@@ -574,10 +570,20 @@
                list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
        blkif_xenbus_init();
+       __unsafe(THIS_MODULE);
        return 0;
 }
 
-__initcall(blkif_init);
+module_init(blkif_init);
+
+static void blkif_exit(void)
+{
+       BUG();
+}
+
+module_exit(blkif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
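
With blkback now buildable as a module (blkbk.o, per the Makefile change above), the request-pool size moves from a boot-time __setup() option to a module parameter that can be given at load time; a hypothetical invocation:

    # formerly: blkif_reqs=64 on the kernel command line
    modprobe blkbk reqs=64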
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/skbuff.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c    Wed Mar  1 19:47:25 2006
@@ -16,6 +16,7 @@
 
 /* Referenced in netback.c. */
 /*static*/ kmem_cache_t *skbuff_cachep;
+EXPORT_SYMBOL(skbuff_cachep);
 
 #define MAX_SKBUFF_ORDER 4
 static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/smpboot.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c   Wed Mar  1 19:47:25 2006
@@ -150,6 +150,11 @@
 {
        vcpu_guest_context_t ctxt;
        struct task_struct *idle = idle_task(vcpu);
+#ifdef __x86_64__
+       struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
+#else
+       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
+#endif
 
        if (vcpu == 0)
                return;
@@ -171,8 +176,8 @@
 
        ctxt.ldt_ents = 0;
 
-       ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[vcpu].address);
-       ctxt.gdt_ents      = cpu_gdt_descr[vcpu].size / 8;
+       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
+       ctxt.gdt_ents      = gdt_descr->size / 8;
 
 #ifdef __i386__
        ctxt.user_regs.cs = __KERNEL_CS;
@@ -210,6 +215,11 @@
 {
        int cpu;
        struct task_struct *idle;
+#ifdef __x86_64__
+       struct desc_ptr *gdt_descr;
+#else
+       struct Xgt_desc_struct *gdt_descr;
+#endif
 
        cpu_data[0] = boot_cpu_data;
 
@@ -225,6 +235,22 @@
        for_each_cpu_mask (cpu, cpu_possible_map) {
                if (cpu == 0)
                        continue;
+
+#ifdef __x86_64__
+               gdt_descr = &cpu_gdt_descr[cpu];
+#else
+               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+#endif
+               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+               if (unlikely(!gdt_descr->address)) {
+                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+                       continue;
+               }
+               gdt_descr->size = GDT_SIZE;
+               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
+               make_page_readonly(
+                       (void *)gdt_descr->address,
+                       XENFEAT_writable_descriptor_tables);
 
                cpu_data[cpu] = boot_cpu_data;
                cpu_2_logical_apicid[cpu] = cpu;
@@ -241,17 +267,6 @@
 #endif
 
                irq_ctx_init(cpu);
-
-               cpu_gdt_descr[cpu].address =
-                       __get_free_page(GFP_KERNEL|__GFP_ZERO);
-               BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
-               cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
-               memcpy((void *)cpu_gdt_descr[cpu].address,
-                      (void *)cpu_gdt_descr[0].address,
-                      cpu_gdt_descr[0].size);
-               make_page_readonly(
-                       (void *)cpu_gdt_descr[cpu].address,
-                       XENFEAT_writable_descriptor_tables);
 
 #ifdef CONFIG_HOTPLUG_CPU
                if (xen_start_info->flags & SIF_INITDOMAIN)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/net_driver_util.c
--- a/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c        Wed Mar  1 19:47:25 2006
@@ -30,6 +30,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/err.h>
+#include <linux/module.h>
 #include <xen/net_driver_util.h>
 
 
@@ -54,7 +55,7 @@
        kfree(macstr);
        return 0;
 }
-
+EXPORT_SYMBOL(xen_net_read_mac);
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar  1 19:47:25 2006
@@ -1,2 +1,5 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
 
-obj-y  := netback.o xenbus.o interface.o loopback.o
+netbk-y   := netback.o xenbus.o interface.o
+netloop-y := loopback.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/loopback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c       Wed Mar  1 19:47:25 2006
@@ -178,6 +178,23 @@
        return err;
 }
 
+static void __init clean_loopback(int i)
+{
+       struct net_device *dev1, *dev2;
+       char dev_name[IFNAMSIZ];
+
+       sprintf(dev_name, "vif0.%d", i);
+       dev1 = dev_get_by_name(dev_name);
+       sprintf(dev_name, "veth%d", i);
+       dev2 = dev_get_by_name(dev_name);
+       if (dev1 && dev2) {
+               unregister_netdev(dev2);
+               unregister_netdev(dev1);
+               free_netdev(dev2);
+               free_netdev(dev1);
+       }
+}
+
 static int __init loopback_init(void)
 {
        int i, err = 0;
@@ -190,6 +207,18 @@
 }
 
 module_init(loopback_init);
+
+static void __exit loopback_exit(void)
+{
+       int i;
+
+       for (i = nloopbacks; i-- > 0; )
+               clean_loopback(i);
+}
+
+module_exit(loopback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/netback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Wed Mar  1 19:47:25 2006
@@ -505,14 +505,12 @@
                        /* Still too big to send right now? Set a callback. */
                        if (txreq.size > netif->remaining_credit) {
                                netif->remaining_credit = 0;
-                               netif->credit_timeout.expires  = 
-                                       next_credit;
                                netif->credit_timeout.data     =
                                        (unsigned long)netif;
                                netif->credit_timeout.function =
                                        tx_credit_callback;
-                               add_timer_on(&netif->credit_timeout,
-                                            smp_processor_id());
+                               __mod_timer(&netif->credit_timeout,
+                                           next_credit);
                                break;
                        }
                }
@@ -811,6 +809,8 @@
                &netif_be_dbg);
 #endif
 
+       __unsafe(THIS_MODULE);
+
        return 0;
 }
 
@@ -821,6 +821,8 @@
 
 module_init(netback_init);
 module_exit(netback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Wed Mar  1 19:47:25 2006
@@ -114,6 +114,7 @@
 
        /* Receive-ring batched refills. */
 #define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
 #define RX_MAX_TARGET NET_RX_RING_SIZE
        int rx_min_target, rx_max_target, rx_target;
        struct sk_buff_head rx_batch;
@@ -1102,8 +1103,8 @@
        spin_lock_init(&np->rx_lock);
 
        skb_queue_head_init(&np->rx_batch);
-       np->rx_target     = RX_MIN_TARGET;
-       np->rx_min_target = RX_MIN_TARGET;
+       np->rx_target     = RX_DFL_MIN_TARGET;
+       np->rx_min_target = RX_DFL_MIN_TARGET;
        np->rx_max_target = RX_MAX_TARGET;
 
        init_timer(&np->rx_refill_timer);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar  1 19:47:25 2006
@@ -54,9 +54,11 @@
 void tpmif_disconnect_complete(tpmif_t * tpmif);
 tpmif_t *tpmif_find(domid_t domid, long int instance);
 void tpmif_interface_init(void);
+void tpmif_interface_exit(void);
 void tpmif_schedule_work(tpmif_t * tpmif);
 void tpmif_deschedule_work(tpmif_t * tpmif);
 void tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
 int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
 irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
 int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c      Wed Mar  1 19:47:25 2006
@@ -186,6 +186,12 @@
                                         0, 0, NULL, NULL);
 }
 
+void __init
+tpmif_interface_exit(void)
+{
+       kmem_cache_destroy(tpmif_cachep);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c        Wed Mar  1 19:47:25 2006
@@ -1092,7 +1092,20 @@
        return 0;
 }
 
-__initcall(tpmback_init);
+module_init(tpmback_init);
+
+static void __exit
+tpmback_exit(void)
+{
+
+       tpmif_xenbus_exit();
+       tpmif_interface_exit();
+       misc_deregister(&ibmvtpms_miscdevice);
+}
+
+module_exit(tpmback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar  1 19:47:25 2006
@@ -317,6 +317,11 @@
        xenbus_register_backend(&tpmback);
 }
 
+void tpmif_xenbus_exit(void)
+{
+       xenbus_unregister_driver(&tpmback);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c      Wed Mar  1 19:47:25 2006
@@ -480,6 +480,11 @@
        xenbus_register_frontend(&tpmfront);
 }
 
+static void __exit exit_tpm_xenbus(void)
+{
+       xenbus_unregister_driver(&tpmfront);
+}
+
 
 static int
 tpm_allocate_buffers(struct tpm_private *tp)
@@ -700,7 +705,18 @@
        return 0;
 }
 
-__initcall(tpmif_init);
+module_init(tpmif_init);
+
+static void __exit
+tpmif_exit(void)
+{
+       exit_tpm_xenbus();
+       gnttab_free_grant_references(gref_head);
+}
+
+module_exit(tpmif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar  1 19:47:25 2006
@@ -23,11 +23,13 @@
        unsigned short pad;
 } __attribute__ ((packed));
 
-extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
+extern struct Xgt_desc_struct idt_descr;
+DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
-       return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
+       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h        Wed Mar  1 19:47:25 2006
@@ -18,8 +18,6 @@
 #define pcibios_assign_all_busses()    0
 #endif
 #define pcibios_scan_all_fns(a, b)     0
-
-extern int no_iommu, force_iommu;
 
 extern unsigned long pci_mem_start;
 #define PCIBIOS_MIN_IO         0x1000
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h    Wed Mar  1 19:47:25 2006
@@ -169,7 +169,7 @@
 #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
 #define FIRST_USER_ADDRESS     0
 
 #ifndef __ASSEMBLY__
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/linux/mm.h
--- a/linux-2.6-xen-sparse/include/linux/mm.h   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/linux/mm.h   Wed Mar  1 19:47:25 2006
@@ -1064,7 +1064,11 @@
 void drop_pagecache(void);
 void drop_slab(void);
 
+#ifndef CONFIG_MMU
+#define randomize_va_space 0
+#else
 extern int randomize_va_space;
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/mm/page_alloc.c
--- a/linux-2.6-xen-sparse/mm/page_alloc.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/mm/page_alloc.c      Wed Mar  1 19:47:25 2006
@@ -1017,7 +1017,7 @@
                if (page)
                        goto got_pg;
 
-               out_of_memory(gfp_mask, order);
+               out_of_memory(zonelist, gfp_mask, order);
                goto restart;
        }
 
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/net/core/skbuff.c
--- a/linux-2.6-xen-sparse/net/core/skbuff.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/net/core/skbuff.c    Wed Mar  1 19:47:25 2006
@@ -434,6 +434,9 @@
        C(pkt_type);
        C(ip_summed);
        C(priority);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+       C(ipvs_property);
+#endif
        C(protocol);
        n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
@@ -441,13 +444,6 @@
        C(nfct);
        nf_conntrack_get(skb->nfct);
        C(nfctinfo);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-       C(nfct_reasm);
-       nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
-       C(ipvs_property);
-#endif
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        C(nfct_reasm);
        nf_conntrack_get_reasm(skb->nfct_reasm);
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/Makefile
--- a/tools/examples/Makefile   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/Makefile   Wed Mar  1 19:47:25 2006
@@ -26,10 +26,11 @@
 XEN_SCRIPTS += network-nat vif-nat
 XEN_SCRIPTS += block
 XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS += vtpm
-XEN_SCRIPT_DATA = xen-script-common.sh
+XEN_SCRIPTS += vtpm vtpm-delete
+XEN_SCRIPTS += xen-hotplug-cleanup
+XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
 XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
-XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh
+XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
 
 XEN_HOTPLUG_DIR = /etc/hotplug
 XEN_HOTPLUG_SCRIPTS = xen-backend.agent
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vif-common.sh
--- a/tools/examples/vif-common.sh      Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vif-common.sh      Wed Mar  1 19:47:25 2006
@@ -125,7 +125,7 @@
 #
 function ip_of()
 {
-  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed 's,/.*,,' | head -1
+  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p'
 }
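
The rewritten ip_of folds the old 'head -1' into sed itself: with -n, the expression '1 s,/.*,,p' strips the CIDR suffix on the first matching line only and prints just that line. For example (addresses are illustrative):

    $ printf '192.0.2.1/24\n192.0.2.2/24\n' | sed -n '1 s,/.*,,p'
    192.0.2.1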
 
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm
--- a/tools/examples/vtpm       Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm       Wed Mar  1 19:47:25 2006
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 dir=$(dirname "$0")
-. "$dir/vtpm-common.sh"
+. "$dir/vtpm-hotplug-common.sh"
 
 vtpm_fatal_error=0
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-common.sh
--- a/tools/examples/vtpm-common.sh     Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-common.sh     Wed Mar  1 19:47:25 2006
@@ -17,21 +17,8 @@
 #
 
 dir=$(dirname "$0")
-. "$dir/xen-hotplug-common.sh"
-
-findCommand "$@"
-if [ "$command" != "online" ]  &&
-   [ "$command" != "offline" ] &&
-   [ "$command" != "add" ]     &&
-   [ "$command" != "remove" ]
-then
-       log err "Invalid command: $command"
-       exit 1
-fi
-
-
-XENBUS_PATH="${XENBUS_PATH:?}"
-
+. "$dir/logging.sh"
+. "$dir/locking.sh"
 
 VTPMDB="/etc/xen/vtpm.db"
 
@@ -58,7 +45,11 @@
        function vtpm_resume() {
                true
        }
+       function vtpm_delete() {
+               true
+       }
 fi
+
 
 #Find the instance number for the vtpm given the name of the domain
 # Parameters
@@ -66,7 +57,7 @@
 # Return value
 #  Returns '0' if instance number could not be found, otherwise
 #  it returns the instance number in the variable 'instance'
-function find_instance () {
+function vtpmdb_find_instance () {
        local vmname=$1
        local ret=0
        instance=`cat $VTPMDB |                    \
@@ -80,18 +71,17 @@
                     }                             \
                   }'`
        if [ "$instance" != "" ]; then
-               ret=1
-       fi
-       return $ret
+               ret=$instance
+       fi
+       echo "$ret"
 }
 
 
 # Check whether a particular instance number is still available
-# returns '1' if it is available
-function is_free_instancenum () {
+# returns "0" if it is not available, "1" otherwise.
+function vtpmdb_is_free_instancenum () {
        local instance=$1
        local avail=1
-
        #Allowed instance number range: 1-255
        if [ $instance -eq 0 -o $instance -gt 255 ]; then
                avail=0
@@ -110,13 +100,13 @@
                        fi
                done
        fi
-       return $avail
+       echo "$avail"
 }
 
 
 # Get an available instance number given the database
 # Returns an unused instance number
-function get_free_instancenum () {
+function vtpmdb_get_free_instancenum () {
        local ctr
        local instances
        local don
@@ -145,12 +135,12 @@
                fi
                let ctr=ctr+1
        done
-       let instance=$ctr
+       echo "$ctr"
 }
 
 
 # Add a domain name and instance number to the DB file
-function add_instance () {
+function vtpmdb_add_instance () {
        local vmname=$1
        local inst=$2
 
@@ -159,8 +149,8 @@
                echo "#1st column: domain name" >> $VTPMDB
                echo "#2nd column: TPM instance number" >> $VTPMDB
        fi
-       validate_entry $vmname $inst
-       if [ $? -eq 0 ]; then
+       res=$(vtpmdb_validate_entry $vmname $inst)
+       if [ $res -eq 0 ]; then
                echo "$vmname $inst" >> $VTPMDB
        fi
 }
@@ -168,11 +158,10 @@
 
 #Validate whether an entry is the same as passed to this
 #function
-function validate_entry () {
+function vtpmdb_validate_entry () {
        local rc=0
        local vmname=$1
        local inst=$2
-       local res
 
        res=`cat $VTPMDB |             \
             gawk -vvmname=$vmname     \
@@ -197,13 +186,15 @@
        elif [ "$res" == "2" ]; then
                let rc=2
        fi
-       return $rc
+       echo "$rc"
 }
 
 
 #Remove an entry from the vTPM database given its domain name
-function remove_entry () {
+#and instance number
+function vtpmdb_remove_entry () {
        local vmname=$1
+       local instance=$2
        local VTPMDB_TMP="$VTPMDB".tmp
        `cat $VTPMDB |             \
         gawk -vvmname=$vmname     \
@@ -214,6 +205,7 @@
         '} > $VTPMDB_TMP`
        if [ -e $VTPMDB_TMP ]; then
                mv -f $VTPMDB_TMP $VTPMDB
+               vtpm_delete $instance
        else
                log err "Error creating temporary file '$VTPMDB_TMP'."
        fi
@@ -222,7 +214,7 @@
 
 # Find the reason for the creation of this device:
 # Set global REASON variable to 'resume' or 'create'
-function get_create_reason () {
+function vtpm_get_create_reason () {
        local resume=$(xenstore-read $XENBUS_PATH/resume)
        if [ "$resume" == "True" ]; then
                REASON="resume"
@@ -230,6 +222,7 @@
                REASON="create"
        fi
 }
+
 
 #Create a vTPM instance
 # If no entry in the TPM database is found, the instance is
@@ -237,26 +230,23 @@
 function vtpm_create_instance () {
        local domname=$(xenstore_read "$XENBUS_PATH"/domain)
        local res
-       set +e
-       get_create_reason
+       local instance
+       vtpm_get_create_reason
 
        claim_lock vtpmdb
-
-       find_instance $domname
-       res=$?
-       if [ $res -eq 0 ]; then
+       instance=$(vtpmdb_find_instance $domname)
+       if [ "$instance" == "0" ]; then
                #Try to give the preferred instance to the domain
                instance=$(xenstore_read "$XENBUS_PATH"/pref_instance)
                if [ "$instance" != "" ]; then
-                       is_free_instancenum $instance
-                       res=$?
+                       res=$(vtpmdb_is_free_instancenum $instance)
                        if [ $res -eq 0 ]; then
-                               get_free_instancenum
+                               instance=$(vtpmdb_get_free_instancenum)
                        fi
                else
-                       get_free_instancenum
+                       instance=$(vtpmdb_get_free_instancenum)
                fi
-               add_instance $domname $instance
+               vtpmdb_add_instance $domname $instance
                if [ "$REASON" == "create" ]; then
                        vtpm_create $instance
                elif [ "$REASON" == "resume" ]; then
@@ -279,25 +269,40 @@
                true
        fi
        xenstore_write $XENBUS_PATH/instance $instance
-       set -e
-}
-
-
-#Remove an instance
+}
+
+
+#Remove an instance when a VM is terminating or suspending.
+#Since it is assumed that the VM will appear again, the
+#entry is kept in the VTPMDB file.
 function vtpm_remove_instance () {
        local domname=$(xenstore_read "$XENBUS_PATH"/domain)
-       set +e
-       find_instance $domname
-       res=$?
-       if [ $res -eq 0 ]; then
-               #Something is really wrong with the DB
-               log err "vTPM DB file $VTPMDB has no entry for '$domname'"
-       else
+
+       claim_lock vtpmdb
+
+       instance=$(vtpmdb_find_instance $domname)
+
+       if [ "$instance" != "0" ]; then
                if [ "$REASON" == "suspend" ]; then
                        vtpm_suspend $instance
                fi
        fi
-       set -e
-}
-
-
+
+       release_lock vtpmdb
+}
+
+
+#Remove an entry in the VTPMDB file given the domain's name
+#1st parameter: The name of the domain
+function vtpm_delete_instance () {
+       local rc
+
+       claim_lock vtpmdb
+
+       instance=$(vtpmdb_find_instance $1)
+       if [ "$instance" != "0" ]; then
+               vtpmdb_remove_entry $1 $instance
+       fi
+
+       release_lock vtpmdb
+}
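
A note on the rewritten helpers above: they now report results by echoing them, so callers capture values with command substitution, e.g. instance=$(vtpmdb_find_instance $domname), instead of reading the shell exit status and a global 'instance' variable as before. This avoids mixing data returns with error returns in $? and behaves cleanly under 'set -e'.
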
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.agent
--- a/tools/examples/xen-backend.agent  Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-backend.agent  Wed Mar  1 19:47:25 2006
@@ -18,12 +18,7 @@
   add)
     ;;
   remove)
-    # remove device frontend store entries
-    xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
-
-    # remove device backend store entries
-    xenstore-rm -t "$XENBUS_PATH"       || true
-    xenstore-rm -t "error/$XENBUS_PATH" || true
+    /etc/xen/scripts/xen-hotplug-cleanup
     ;;
   online)
     ;;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.rules
--- a/tools/examples/xen-backend.rules  Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-backend.rules  Wed Mar  1 19:47:25 2006
@@ -2,6 +2,4 @@
 SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm 
$env{ACTION}"
 SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} 
online"
 SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline", 
RUN+="$env{script} offline"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/bin/bash -c 
'/usr/bin/xenstore-rm -t $$(/usr/bin/xenstore-read $env{XENBUS_PATH}/frontend)'"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t 
$env{XENBUS_PATH}"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t 
error/$env{XENBUS_PATH}"
+SUBSYSTEM=="xen-backend", ACTION=="remove", 
RUN+="/etc/xen/scripts/xen-hotplug-cleanup"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-common.sh
--- a/tools/examples/xen-hotplug-common.sh      Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-common.sh      Wed Mar  1 19:47:25 2006
@@ -17,19 +17,15 @@
 
 
 dir=$(dirname "$0")
+. "$dir/logging.sh"
 . "$dir/xen-script-common.sh"
+. "$dir/locking.sh"
 
 exec 2>>/var/log/xen-hotplug.log
 
 export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
 export LANG="POSIX"
 unset $(set | grep ^LC_ | cut -d= -f1)
-
-log() {
-  local level="$1"
-  shift
-  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
-}
 
 fatal() {
   xenstore_write "$XENBUS_PATH"/hotplug-status error
@@ -93,87 +89,4 @@
 }
 
 
-#
-# Serialisation
-#
-
-LOCK_SLEEPTIME=1
-LOCK_SPINNING_RETRIES=5
-LOCK_RETRIES=10
-LOCK_BASEDIR=/var/run/xen-hotplug
-
-
-claim_lock()
-{
-  local lockdir="$LOCK_BASEDIR/$1"
-  mkdir -p "$LOCK_BASEDIR"
-  _claim_lock "$lockdir"
-}
-
-
-release_lock()
-{
-  _release_lock "$LOCK_BASEDIR/$1"
-}
-
-
-_claim_lock()
-{
-  local lockdir="$1"
-  local owner=$(_lock_owner "$lockdir")
-  local retries=0
-
-  while [ $retries -lt $LOCK_RETRIES ]
-  do
-    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
-      _update_lock_info "$lockdir" && return
-
-    local new_owner=$(_lock_owner "$lockdir")
-    if [ "$new_owner" != "$owner" ]
-    then
-      owner="$new_owner"
-      retries=0
-    fi
-
-    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
-    then
-      sleep $LOCK_SLEEPTIME
-    else
-      sleep 0
-    fi
-    retries=$(($retries + 1))
-  done
-  _steal_lock "$lockdir"
-}
-
-
-_release_lock()
-{
-  trap sigerr ERR
-  rm -rf "$1" 2>/dev/null || true
-}
-
-
-_steal_lock()
-{
-  local lockdir="$1"
-  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
-  log err "Forced to steal lock on $lockdir from $owner!"
-  _release_lock "$lockdir"
-  _claim_lock "$lockdir"
-}
-
-
-_lock_owner()
-{
-  cat "$1/owner" 2>/dev/null || echo "unknown"
-}
-
-
-_update_lock_info()
-{
-  echo "$$: $0" >"$1/owner"
-}
-
-
 log debug "$@" "XENBUS_PATH=$XENBUS_PATH"
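
The locking and logging helpers deleted above do not disappear; per the new source lines at the top of this file they live in locking.sh and logging.sh. Splitting them out lets vtpm-common.sh take the vtpmdb lock via claim_lock/release_lock without sourcing the xenstore-oriented remainder of xen-hotplug-common.sh.
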
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/hvmloader/Makefile
--- a/tools/firmware/hvmloader/Makefile Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/hvmloader/Makefile Wed Mar  1 19:47:25 2006
@@ -19,7 +19,7 @@
 #
 
 XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
 
 # The HVM loader is started in 32-bit mode at the address below:
 LOADADDR = 0x100000
@@ -29,9 +29,13 @@
 
 OBJECTS         = hvmloader.o acpi_madt.o 
 
-CC       = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
 OBJCOPY  = objcopy
-CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
 CFLAGS  += -m32 -march=i686
 LDFLAGS  = -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,$(LOADADDR)
 
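
test-gcc-flag, a helper provided by the Config.mk now included at the top of this Makefile, expands to the given flag only when the installed compiler accepts it, so the -nopie and stack-protector probes are no-ops on toolchains without those options. That is also why CFLAGS switches from '=' to '+=': the probed flags must accumulate with the flags set here.
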
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vgabios/Makefile
--- a/tools/firmware/vgabios/Makefile   Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/vgabios/Makefile   Wed Mar  1 19:47:25 2006
@@ -1,6 +1,4 @@
 CC      = gcc
-CFLAGS  = -g -O2 -Wall -Wstrict-prototypes
-LDFLAGS = 
 
 GCC = gcc
 BCC = bcc
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vmxassist/Makefile
--- a/tools/firmware/vmxassist/Makefile Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/vmxassist/Makefile Wed Mar  1 19:47:25 2006
@@ -19,7 +19,7 @@
 #
 
 XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
 
 # The emulator code lives in ROM space
 TEXTADDR=0x000D0000
@@ -27,11 +27,14 @@
 DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR)
 XENINC=-I$(XEN_ROOT)/tools/libxc
 
-LD       = ld
-CC       = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
 CPP      = cpp -P
 OBJCOPY  = objcopy -p -O binary -R .note -R .comment -R .bss -S --gap-fill=0
-CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
 CFLAGS  += -m32 -march=i686
 LDFLAGS  = -m elf_i386
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/Makefile
--- a/tools/ioemu/Makefile      Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/Makefile      Wed Mar  1 19:47:25 2006
@@ -1,6 +1,9 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
 -include config-host.mak
 
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing 
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing 
 ifdef CONFIG_DARWIN
 CFLAGS+= -mdynamic-no-pic
 endif
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/ide.c
--- a/tools/ioemu/hw/ide.c      Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/hw/ide.c      Wed Mar  1 19:47:25 2006
@@ -669,9 +669,6 @@
     }
     if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) {
         s->status = READY_STAT | SEEK_STAT;
-        s->bmdma->status &= ~BM_STATUS_DMAING;
-        s->bmdma->status |= BM_STATUS_INT;
-        ide_set_irq(s);
 #ifdef DEBUG_IDE_ATAPI
         printf("dma status=0x%x\n", s->status);
 #endif
@@ -738,9 +735,6 @@
             if (n == 0) {
                 /* end of transfer */
                 s->status = READY_STAT | SEEK_STAT;
-                s->bmdma->status &= ~BM_STATUS_DMAING;
-                s->bmdma->status |= BM_STATUS_INT;
-                ide_set_irq(s);
                 return 0;
             }
             if (n > MAX_MULT_SECTORS)
@@ -987,9 +981,6 @@
     if (s->packet_transfer_size <= 0) {
         s->status = READY_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
-        s->bmdma->status &= ~BM_STATUS_DMAING;
-        s->bmdma->status |= BM_STATUS_INT;
-        ide_set_irq(s);
 #ifdef DEBUG_IDE_ATAPI
         printf("dma status=0x%x\n", s->status);
 #endif
@@ -2025,6 +2016,17 @@
     }
 }
 
+static void ide_dma_finish(BMDMAState *bm)
+{
+    IDEState *s = bm->ide_if;
+
+    bm->status &= ~BM_STATUS_DMAING;
+    bm->status |= BM_STATUS_INT;
+    bm->dma_cb = NULL;
+    bm->ide_if = NULL;
+    ide_set_irq(s);
+}
+
 /* XXX: full callback usage to prepare non blocking I/Os support -
    error handling */
 #ifdef DMA_MULTI_THREAD
@@ -2070,9 +2072,8 @@
         cur_addr += 8;
     }
     /* end of transfer */
- the_end:
-    bm->dma_cb = NULL;
-    bm->ide_if = NULL;
+the_end:
+    ide_dma_finish(bm);
 }
 
 static void ide_dma_start(IDEState *s, IDEDMAFunc *dma_cb)
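
The three identical end-of-transfer sequences removed above are centralised in ide_dma_finish(): it clears BM_STATUS_DMAING, latches BM_STATUS_INT, detaches the dma_cb/ide_if pair and only then raises the interrupt, so the controller state is fully quiesced before the guest's handler can react.
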
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/pcnet.c
--- a/tools/ioemu/hw/pcnet.c    Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/hw/pcnet.c    Wed Mar  1 19:47:25 2006
@@ -376,6 +376,10 @@
     if (s->recv_pos > 0)
         return 0;
 
+    pcnet_rdte_poll(s);
+    if (!(CSR_CRST(s) & 0x8000)) {
+        return 0;
+    }
     return sizeof(s->buffer)-16;
 }
 
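
pcnet_can_receive() previously answered from the staging buffer alone. It now also polls the receive descriptor ring via pcnet_rdte_poll() and reports no space unless the current receive descriptor is owned by the card (the 0x8000 OWN bit reflected in CSR_CRST), so frames are refused up front rather than accepted with no descriptor to DMA them into.
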
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/target-i386-dm/Makefile
--- a/tools/ioemu/target-i386-dm/Makefile       Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/target-i386-dm/Makefile       Wed Mar  1 19:47:25 2006
@@ -1,7 +1,8 @@
+include config.mak
+override TARGET_ARCH=i386
+
 XEN_ROOT=../../..
 include $(XEN_ROOT)/tools/Rules.mk
-include config.mak
-override TARGET_ARCH=i386
 
 INSTALL_DIR := $(DESTDIR)/usr/$(LIBDIR)/xen/bin
 TARGET_PATH=$(SRC_PATH)/target-$(TARGET_ARCH)
@@ -12,7 +13,7 @@
 VPATH+=:$(SRC_PATH)/linux-user
 DEFINES+=-I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ARCH)
 endif
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing
 LDFLAGS=-g
 LIBS=
 HELPER_CFLAGS=$(CFLAGS)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Wed Mar  1 17:01:54 2006
+++ b/tools/libxc/xc_linux_build.c      Wed Mar  1 19:47:25 2006
@@ -45,6 +45,77 @@
 #ifdef __ia64__
 #define probe_aout9(image,image_size,load_funcs) 1
 #endif
+
+static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
+    [XENFEAT_writable_page_tables]       = "writable_page_tables",
+    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
+    [XENFEAT_auto_translated_physmap]    = "auto_translated_physmap",
+    [XENFEAT_supervisor_mode_kernel]     = "supervisor_mode_kernel",
+    [XENFEAT_pae_pgdir_above_4gb]        = "pae_pgdir_above_4gb"
+};
+
+static inline void set_feature_bit (int nr, uint32_t *addr)
+{
+    addr[nr>>5] |= (1<<(nr&31));
+}
+
+static inline int test_feature_bit(int nr, uint32_t *addr)
+{
+    return !!(addr[nr>>5] & (1<<(nr&31)));
+}
+
+static int parse_features(
+    const char *feats,
+    uint32_t supported[XENFEAT_NR_SUBMAPS],
+    uint32_t required[XENFEAT_NR_SUBMAPS])
+{
+    const char *end, *p;
+    int i, req;
+
+    if ( (end = strchr(feats, ',')) == NULL )
+        end = feats + strlen(feats);
+
+    while ( feats < end )
+    {
+        p = strchr(feats, '|');
+        if ( (p == NULL) || (p > end) )
+            p = end;
+
+        req = (*feats == '!');
+        if ( req )
+            feats++;
+
+        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
+        {
+            if ( feature_names[i] == NULL )
+                continue;
+
+            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
+            {
+                set_feature_bit(i, supported);
+                if ( required && req )
+                    set_feature_bit(i, required);
+                break;
+            }
+        }
+
+        if ( i == XENFEAT_NR_SUBMAPS*32 )
+        {
+            ERROR("Unknown feature \"%.*s\".\n", (int)(p-feats), feats);
+            if ( req )
+            {
+                ERROR("Kernel requires an unknown hypervisor feature.\n");
+                return 0; /* callers treat a zero return as failure */
+            }
+        }
+
+        feats = p;
+        if ( *feats == '|' )
+            feats++;
+    }
+
+    return 1; /* success; callers treat a zero return as failure */
+}
 
 static int probeimageformat(char *image,
                             unsigned long image_size,
@@ -344,7 +415,8 @@
                        unsigned long shared_info_frame,
                        unsigned long flags,
                        unsigned int store_evtchn, unsigned long *store_mfn,
-                       unsigned int console_evtchn, unsigned long *console_mfn)
+                       unsigned int console_evtchn, unsigned long *console_mfn,
+                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
     unsigned long *page_array = NULL;
     struct load_funcs load_funcs;
@@ -483,7 +555,8 @@
                        unsigned long shared_info_frame,
                        unsigned long flags,
                        unsigned int store_evtchn, unsigned long *store_mfn,
-                       unsigned int console_evtchn, unsigned long *console_mfn)
+                       unsigned int console_evtchn, unsigned long *console_mfn,
+                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
     unsigned long *page_array = NULL;
     unsigned long count, i, hypercall_pfn;
@@ -515,8 +588,9 @@
     unsigned long vpt_start;
     unsigned long vpt_end;
     unsigned long v_end;
-    unsigned shadow_mode_enabled;
     unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn;
+    unsigned long shadow_mode_enabled;
+    uint32_t supported_features[XENFEAT_NR_SUBMAPS] = { 0, };
 
     rc = probeimageformat(image, image_size, &load_funcs);
     if ( rc != 0 )
@@ -534,8 +608,6 @@
         goto error_out;
     }
 
-    shadow_mode_enabled = !!strstr(dsi.xen_guest_string,
-                                   "SHADOW=translate");
     /*
      * Why do we need this? The number of page-table frames depends on the 
      * size of the bootstrap address space. But the size of the address space 
@@ -637,6 +709,35 @@
     (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
                            &dsi);
 
+    /* Parse and validate kernel features. */
+    p = strstr(dsi.xen_guest_string, "FEATURES=");
+    if ( p != NULL )
+    {
+        if ( !parse_features(p + strlen("FEATURES="),
+                             supported_features,
+                             required_features) )
+        {
+            ERROR("Failed to parse guest kernel features.\n");
+            goto error_out;
+        }
+
+        fprintf(stderr, "Supported features  = { %08x }.\n",
+                supported_features[0]);
+        fprintf(stderr, "Required features   = { %08x }.\n",
+                required_features[0]);
+    }
+
+    for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
+    {
+        if ( (supported_features[i]&required_features[i]) != required_features[i] )
+        {
+            ERROR("Guest kernel does not support a required feature.\n");
+            goto error_out;
+        }
+    }
+
+    shadow_mode_enabled = test_feature_bit(XENFEAT_auto_translated_physmap, required_features);
+
     /* Load the initial ramdisk image. */
     if ( initrd_len != 0 )
     {
@@ -870,6 +971,7 @@
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
+                   const char *features,
                    unsigned long flags,
                    unsigned int store_evtchn,
                    unsigned long *store_mfn,
@@ -886,6 +988,16 @@
     char         *image = NULL;
     unsigned long image_size, initrd_size=0;
     unsigned long vstartinfo_start, vkern_entry, vstack_start;
+    uint32_t      features_bitmap[XENFEAT_NR_SUBMAPS] = { 0, };
+
+    if ( features != NULL )
+    {
+        if ( !parse_features(features, features_bitmap, NULL) )
+        {
+            PERROR("Failed to parse configured features\n");
+            goto error_out;
+        }
+    }
 
     if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
     {
@@ -940,7 +1052,8 @@
                      &vstack_start, ctxt, cmdline,
                      op.u.getdomaininfo.shared_info_frame,
                      flags, store_evtchn, store_mfn,
-                     console_evtchn, console_mfn) < 0 )
+                     console_evtchn, console_mfn,
+                     features_bitmap) < 0 )
     {
         ERROR("Error constructing guest OS");
         goto error_out;
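
To make the FEATURES handling above concrete: the string comes from the kernel's __xen_guest note, entries are separated by '|', a '!' prefix marks a feature the kernel requires rather than merely supports, and parsing stops at the first ','. A minimal sketch of the resulting bitmaps, written against the definitions above (illustrative fragment only, not code from this changeset; it would additionally need <assert.h>):

    static void example_feature_parse(void)
    {
        uint32_t supported[XENFEAT_NR_SUBMAPS] = { 0, };
        uint32_t required[XENFEAT_NR_SUBMAPS]  = { 0, };

        /* As if the kernel advertised
         *   FEATURES=writable_page_tables|!writable_descriptor_tables  */
        parse_features("writable_page_tables|!writable_descriptor_tables",
                       supported, required);

        /* Every recognised entry is set in 'supported'; '!' entries are
         * additionally set in 'required', and the builder rejects the
         * image if 'required' asks for a bit the hypervisor lacks.  */
        assert(test_feature_bit(XENFEAT_writable_page_tables, supported));
        assert(test_feature_bit(XENFEAT_writable_descriptor_tables, required));
        assert(!test_feature_bit(XENFEAT_writable_page_tables, required));
    }
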
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Wed Mar  1 17:01:54 2006
+++ b/tools/libxc/xenguest.h    Wed Mar  1 19:47:25 2006
@@ -47,6 +47,7 @@
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
+                   const char *features,
                    unsigned long flags,
                    unsigned int store_evtchn,
                    unsigned long *store_mfn,
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub   Wed Mar  1 17:01:54 2006
+++ b/tools/pygrub/src/pygrub   Wed Mar  1 19:47:25 2006
@@ -94,11 +94,17 @@
             return struct.unpack("<L", buf[poff+8:poff+12])[0] * SECTOR_SIZE
     return -1
 
-def get_config(fn):
+def get_config(fn, isconfig = False):
     if not os.access(fn, os.R_OK):
         raise RuntimeError, "Unable to access %s" %(fn,)
 
     cf = grub.GrubConf.GrubConfigFile()
+
+    if isconfig:
+        # set the config file and parse it
+        cf.filename = fn
+        cf.parse()
+        return cf
 
     offset = 0
     if is_disk_image(fn):
@@ -130,9 +136,7 @@
         # then parse the grub config
         cf.parse(buf)
     else:
-        # set the config file and parse it
-        cf.filename = fn
-        cf.parse()
+        raise RuntimeError, "Unable to read filesystem" 
     
     return cf
 
@@ -214,7 +218,8 @@
 
     try:
         opts, args = getopt.gnu_getopt(sys.argv[1:], 'qh::',
-                                   ["quiet", "help", "output=", "entry="])
+                                   ["quiet", "help", "output=", "entry=",
+                                    "isconfig"])
     except getopt.GetoptError:
         usage()
         sys.exit(1)
@@ -227,6 +232,7 @@
     output = None
     entry = None
     interactive = True
+    isconfig = False
     for o, a in opts:
         if o in ("-q", "--quiet"):
             interactive = False
@@ -239,13 +245,15 @@
             entry = a
             # specifying the entry to boot implies non-interactive
             interactive = False
+        elif o in ("--isconfig",):
+            isconfig = True
 
     if output is None or output == "-":
         fd = sys.stdout.fileno()
     else:
         fd = os.open(output, os.O_WRONLY)
 
-    cf = get_config(file)
+    cf = get_config(file, isconfig)
     if interactive:
         curses.wrapper(run_main)
     else:
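
The new --isconfig flag tells pygrub to treat its argument as a grub configuration file and parse it directly, for instance (illustrative invocation) 'pygrub --quiet --isconfig /boot/grub/grub.conf', instead of probing it as a disk image. As a side effect of the restructuring, a file that is neither a recognisable disk image nor explicitly flagged as a config now raises 'Unable to read filesystem' rather than being silently parsed as a config file.
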
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar  1 19:47:25 2006
@@ -326,27 +326,29 @@
                                   PyObject *kwds)
 {
     uint32_t dom;
-    char *image, *ramdisk = NULL, *cmdline = "";
+    char *image, *ramdisk = NULL, *cmdline = "", *features = NULL;
     int flags = 0;
     int store_evtchn, console_evtchn;
     unsigned long store_mfn = 0;
     unsigned long console_mfn = 0;
 
-    static char *kwd_list[] = { "dom", "store_evtchn", 
-                                "console_evtchn", "image", 
+    static char *kwd_list[] = { "dom", "store_evtchn",
+                                "console_evtchn", "image",
                                /* optional */
-                               "ramdisk", "cmdline", "flags", NULL };
-
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssi", kwd_list,
+                               "ramdisk", "cmdline", "flags",
+                               "features", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssis", kwd_list,
                                       &dom, &store_evtchn,
-                                     &console_evtchn, &image, 
+                                     &console_evtchn, &image,
                                      /* optional */
-                                     &ramdisk, &cmdline, &flags) )
+                                     &ramdisk, &cmdline, &flags,
+                                     &features) )
         return NULL;
 
     if ( xc_linux_build(self->xc_handle, dom, image,
-                        ramdisk, cmdline, flags,
-                        store_evtchn, &store_mfn, 
+                        ramdisk, cmdline, features, flags,
+                        store_evtchn, &store_mfn,
                        console_evtchn, &console_mfn) != 0 ) {
         if (!errno)
              errno = EINVAL;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendBootloader.py
--- a/tools/python/xen/xend/XendBootloader.py   Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/XendBootloader.py   Wed Mar  1 19:47:25 2006
@@ -1,7 +1,7 @@
 #
 # XendBootloader.py - Framework to run a boot loader for picking the kernel
 #
-# Copyright 2005 Red Hat, Inc.
+# Copyright 2005-2006 Red Hat, Inc.
 # Jeremy Katz <katzj@xxxxxxxxxx>
 #
 # This software may be freely redistributed under the terms of the GNU
@@ -13,12 +13,11 @@
 #
 
 import os, select, errno
+import random
 import sxp
 
 from XendLogging import log
 from XendError import VmError
-
-BL_FIFO = "/var/lib/xen/xenbl"
 
 def bootloader(blexec, disk, quiet = 0, vcpus = None, entry = None):
     """Run the boot loader executable on the given disk and return a
@@ -38,14 +37,18 @@
         log.error(msg)
         raise VmError(msg)
 
-    os.mkfifo(BL_FIFO, 0600)
+    while True:
+        fifo = "/var/lib/xen/xenbl.%s" %(random.randint(0, 32000),)
+        if not os.path.exists(fifo):
+            break
+    os.mkfifo(fifo, 0600)
 
     child = os.fork()
     if (not child):
         args = [ blexec ]
         if quiet:
             args.append("-q")
-        args.append("--output=%s" %(BL_FIFO,))
+        args.append("--output=%s" %(fifo,))
         if entry is not None:
             args.append("--entry=%s" %(entry,))
         args.append(disk)
@@ -59,7 +62,7 @@
 
     while 1:
         try:
-            r = os.open(BL_FIFO, os.O_RDONLY)
+            r = os.open(fifo, os.O_RDONLY)
         except OSError, e:
             if e.errno == errno.EINTR:
                 continue
@@ -74,7 +77,7 @@
         
     os.waitpid(child, 0)
     os.close(r)
-    os.unlink(BL_FIFO)
+    os.unlink(fifo)
 
     if len(ret) == 0:
         msg = "Boot loader didn't return any data!"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Mar  1 19:47:25 2006
@@ -1502,15 +1502,14 @@
         if not self.info['bootloader']:
             return
         # if we're restarting with a bootloader, we need to run it
-        # FIXME: this assumes the disk is the first device and
-        # that we're booting from the first disk
         blcfg = None
         config = self.sxpr()
         # FIXME: this assumes that we want to use the first disk
-        dev = sxp.child_value(config, "device")
-        if dev:
-            disk = sxp.child_value(dev, "uname")
-            fn = blkdev_uname_to_file(disk)
+        for dev in sxp.children(config, "device"):
+            disk = sxp.child(dev, "vbd")
+            if disk is None:
+                continue
+            fn = blkdev_uname_to_file(sxp.child_value(disk, "uname"))
             blcfg = bootloader(self.info['bootloader'], fn, 1,
                                self.info['vcpus'])
         if blcfg is None:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/image.py    Wed Mar  1 19:47:25 2006
@@ -68,6 +68,7 @@
         self.kernel = None
         self.ramdisk = None
         self.cmdline = None
+        self.features = None
 
         self.configure(imageConfig, deviceConfig)
 
@@ -89,6 +90,7 @@
         if args:
             self.cmdline += " " + args
         self.ramdisk = get_cfg("ramdisk", '')
+        self.features = get_cfg("features", '')
         
         self.vm.storeVm(("image/ostype", self.ostype),
                         ("image/kernel", self.kernel),
@@ -175,13 +177,15 @@
         log.debug("cmdline        = %s", self.cmdline)
         log.debug("ramdisk        = %s", self.ramdisk)
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
+        log.debug("features       = %s", self.features)
 
         return xc.linux_build(dom            = self.vm.getDomid(),
                               image          = self.kernel,
                               store_evtchn   = store_evtchn,
                               console_evtchn = console_evtchn,
                               cmdline        = self.cmdline,
-                              ramdisk        = self.ramdisk)
+                              ramdisk        = self.ramdisk,
+                              features       = self.features)
 
 class HVMImageHandler(ImageHandler):
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py     Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/server/netif.py     Wed Mar  1 19:47:25 2006
@@ -113,7 +113,8 @@
                            script.replace(xroot.network_script_dir + os.sep,
                                           "")])
         if ip:
-            result.append(['ip', ip.split(" ")])
+            for i in ip.split(" "):
+                result.append(['ip', i])
         if bridge:
             result.append(['bridge', bridge])
         if mac:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xm/create.py     Wed Mar  1 19:47:25 2006
@@ -137,6 +137,10 @@
           fn=set_value, default='',
           use="Path to ramdisk.")
 
+gopts.var('features', val='FEATURES',
+          fn=set_value, default='',
+          use="Features to enable in guest kernel")
+
 gopts.var('builder', val='FUNCTION',
           fn=set_value, default='linux',
           use="Function to use to build the domain.")
@@ -445,6 +449,8 @@
         config_image.append(['root', cmdline_root])
     if vals.extra:
         config_image.append(['args', vals.extra])
+    if vals.features:
+        config_image.append(['features', vals.features])
 
     if vals.builder == 'hvm':
         configure_hvm(config_image, vals)
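
This completes the plumbing for guest kernel features: a domain configuration can now carry, for example, features = 'writable_page_tables' (illustrative value), which travels through the image SXP into xc.linux_build's new 'features' keyword argument and finally into parse_features() in xc_linux_build.c, using the same '|' separator and '!' required-prefix described there.
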
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/tests/Makefile
--- a/tools/tests/Makefile      Wed Mar  1 17:01:54 2006
+++ b/tools/tests/Makefile      Wed Mar  1 19:47:25 2006
@@ -4,13 +4,12 @@
 
 TARGET := test_x86_emulator
 
-CC     := gcc
-CFLAGS := -O2 -Wall -Werror -D__TEST_HARNESS__
+HOSTCFLAGS += -D__TEST_HARNESS__
 
 all: $(TARGET)
 
 $(TARGET): x86_emulate.o test_x86_emulator.o
-       $(CC) -o $@ $^
+       $(HOSTCC) -o $@ $^
 
 clean:
        rm -rf $(TARGET) *.o *~ core
@@ -18,7 +17,7 @@
 install:
 
 x86_emulate.o: $(XEN_ROOT)/xen/arch/x86/x86_emulate.c
-       $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+       $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
 
 %.o: %.c
-       $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+       $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xenstore/xs.c
--- a/tools/xenstore/xs.c       Wed Mar  1 17:01:54 2006
+++ b/tools/xenstore/xs.c       Wed Mar  1 19:47:25 2006
@@ -31,7 +31,6 @@
 #include <signal.h>
 #include <stdint.h>
 #include <errno.h>
-#include <sys/ioctl.h>
 #include <pthread.h>
 #include "xs.h"
 #include "list.h"
@@ -343,7 +342,6 @@
                free(ret);
                saved_errno = EBADF;
                goto close_fd;
-               
        }
        return ret;
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/configure.ac
--- a/tools/xm-test/configure.ac        Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/configure.ac        Wed Mar  1 19:47:25 2006
@@ -93,6 +93,7 @@
     tests/unpause/Makefile
     tests/vcpu-pin/Makefile
     tests/vcpu-disable/Makefile
+    tests/vtpm/Makefile
     tests/enforce_dom0_cpus/Makefile
     lib/XmTestReport/xmtest.py
     lib/XmTestLib/config.py
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/Network.py
--- a/tools/xm-test/lib/XmTestLib/Network.py    Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/Network.py    Wed Mar  1 19:47:25 2006
@@ -22,6 +22,7 @@
 import sys;
 import os;
 import atexit;
+import random;
 
 from Test import *
 from Xm import *
@@ -53,12 +54,22 @@
         if rc == 0:
             SKIP("Zeroconf address found: " + out)
 
+        # Randomize one octet of the IP addresses we choose, so that
+        # multiple machines running network tests don't interfere 
+        # with each other. 
+        self.subnet = random.randint(1,254)
+
     def calc_ip_address(self, dom, interface):
         # Generate an IP address from the dom# and eth#:
-        #      169.254.(eth#+153).(dom#+10)
+        #      169.254.(self.subnet).((eth# * 16) + dom# + 1)
         ethnum = int(interface[len("eth"):])
+        if (ethnum > 15):
+            raise NetworkError("ethnum > 15 : " + interface)
         domnum = int(dom[len("dom"):])
-        return "169.254."+ str(ethnum+153) + "." + str(domnum+10)
+        if (domnum > 14):
+            raise NetworkError("domnum > 14 : " + dom)
+
+        return "169.254."+ str(self.subnet) + "." + str(ethnum*16+domnum+1)
 
     def ip(self, dom, interface, todomname=None, toeth=None, bridge=None):
         newip = self.calc_ip_address(dom, interface)
@@ -96,4 +107,4 @@
         return newip
 
     def mask(self, dom, interface):
-        return "255.255.255.0"
+        return "255.255.255.240"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/XenDomain.py
--- a/tools/xm-test/lib/XmTestLib/XenDomain.py  Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/XenDomain.py  Wed Mar  1 19:47:25 2006
@@ -99,6 +99,7 @@
         # These options need to be lists
         self.defaultOpts["disk"] = []
         self.defaultOpts["vif"]  = []
+        self.defaultOpts["vtpm"] = []
 
         self.opts = self.defaultOpts
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/Makefile.am
--- a/tools/xm-test/tests/Makefile.am   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/Makefile.am   Wed Mar  1 19:47:25 2006
@@ -23,6 +23,7 @@
                unpause         \
                vcpu-disable    \
                vcpu-pin        \
+               vtpm            \
                enforce_dom0_cpus       \
                save restore migrate
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/Rules.mk
--- a/xen/Rules.mk      Wed Mar  1 17:01:54 2006
+++ b/xen/Rules.mk      Wed Mar  1 19:47:25 2006
@@ -45,7 +45,7 @@
 
 include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk
 
-CFLAGS += -g
+CFLAGS += -g -D__XEN__
 
 ifneq ($(debug),y)
 CFLAGS += -DNDEBUG
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar  1 19:47:25 2006
@@ -57,45 +57,7 @@
     vcpu_set_gr(vcpu, 8, ret, 0);
     vmx_vcpu_increment_iip(vcpu);
 }
-/* turn off temporarily, we will merge hypercall parameter convention with xeno, when
-    VTI domain need to call hypercall */
-#if 0
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...)
-{
-    struct mc_state *mcs = &mc_state[smp_processor_id()];
-    VCPU *vcpu = current;
-    struct cpu_user_regs *regs = vcpu_regs(vcpu);
-    unsigned int i;
-    va_list args;
-
-    va_start(args, nr_args);
-    if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
-       panic("PREEMPT happen in multicall\n"); // Not support yet
-    } else {
-       vcpu_set_gr(vcpu, 15, op, 0);
-       for ( i = 0; i < nr_args; i++) {
-           switch (i) {
-           case 0: vcpu_set_gr(vcpu, 16, va_arg(args, unsigned long), 0);
-                   break;
-           case 1: vcpu_set_gr(vcpu, 17, va_arg(args, unsigned long), 0);
-                   break;
-           case 2: vcpu_set_gr(vcpu, 18, va_arg(args, unsigned long), 0);
-                   break;
-           case 3: vcpu_set_gr(vcpu, 19, va_arg(args, unsigned long), 0);
-                   break;
-           case 4: vcpu_set_gr(vcpu, 20, va_arg(args, unsigned long), 0);
-                   break;
-           default: panic("Too many args for hypercall continuation\n");
-                   break;
-           }
-       }
-    }
-    vcpu->arch.hypercall_continuation = 1;
-    va_end(args);
-    return op;
-}
-#endif
+
 void hyper_dom_mem_op(void)
 {
     VCPU *vcpu=current;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/xen/process.c
--- a/xen/arch/ia64/xen/process.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/ia64/xen/process.c       Wed Mar  1 19:47:25 2006
@@ -801,30 +801,48 @@
        reflect_interruption(isr,regs,vector);
 }
 
-unsigned long __hypercall_create_continuation(
-       unsigned int op, unsigned int nr_args, ...)
+unsigned long hypercall_create_continuation(
+       unsigned int op, const char *format, ...)
 {
     struct mc_state *mcs = &mc_state[smp_processor_id()];
     struct vcpu *v = current;
+    const char *p = format;
+    unsigned long arg;
     unsigned int i;
     va_list args;
 
-    va_start(args, nr_args);
+    va_start(args, format);
     if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
        panic("PREEMPT happen in multicall\n"); // Not support yet
     } else {
        vcpu_set_gr(v, 2, op, 0);
-       for ( i = 0; i < nr_args; i++) {
+       for ( i = 0; *p != '\0'; i++) {
+            switch ( *p++ )
+            {
+            case 'i':
+                arg = (unsigned long)va_arg(args, unsigned int);
+                break;
+            case 'l':
+                arg = (unsigned long)va_arg(args, unsigned long);
+                break;
+            case 'p':
+            case 'h':
+                arg = (unsigned long)va_arg(args, void *);
+                break;
+            default:
+                arg = 0;
+                BUG();
+            }
            switch (i) {
-           case 0: vcpu_set_gr(v, 14, va_arg(args, unsigned long), 0);
+           case 0: vcpu_set_gr(v, 14, arg, 0);
                    break;
-           case 1: vcpu_set_gr(v, 15, va_arg(args, unsigned long), 0);
+           case 1: vcpu_set_gr(v, 15, arg, 0);
                    break;
-           case 2: vcpu_set_gr(v, 16, va_arg(args, unsigned long), 0);
+           case 2: vcpu_set_gr(v, 16, arg, 0);
                    break;
-           case 3: vcpu_set_gr(v, 17, va_arg(args, unsigned long), 0);
+           case 3: vcpu_set_gr(v, 17, arg, 0);
                    break;
-           case 4: vcpu_set_gr(v, 18, va_arg(args, unsigned long), 0);
+           case 4: vcpu_set_gr(v, 18, arg, 0);
                    break;
            default: panic("Too many args for hypercall continuation\n");
                    break;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/Makefile     Wed Mar  1 19:47:25 2006
@@ -33,6 +33,10 @@
  endif
 endif
 
+ifneq ($(supervisor_mode_kernel),y)
+OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
+endif
+
 OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
 OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS))
 
@@ -44,7 +48,7 @@
 
 $(TARGET): $(TARGET)-syms boot/mkelf32
        ./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \
-       `nm $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
+       `$(NM) $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
 
 $(CURDIR)/arch.o: $(OBJS)
        $(LD) $(LDFLAGS) -r -o $@ $(OBJS)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Rules.mk
--- a/xen/arch/x86/Rules.mk     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/Rules.mk     Wed Mar  1 19:47:25 2006
@@ -6,6 +6,7 @@
 # 'make clean' before rebuilding.
 #
 pae ?= n
+supervisor_mode_kernel ?= n
 
 CFLAGS  += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
 CFLAGS  += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
@@ -32,6 +33,9 @@
 CFLAGS  += -DCONFIG_X86_PAE=1
 endif
 endif
+ifeq ($(supervisor_mode_kernel),y)
+CFLAGS  += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
+endif
 
 ifeq ($(TARGET_SUBARCH),x86_64)
 CFLAGS  += -m64 -mno-red-zone -fpic -fno-reorder-blocks
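
supervisor_mode_kernel follows the same pattern as the existing pae option: building with, e.g., 'make supervisor_mode_kernel=y' defines CONFIG_X86_SUPERVISOR_MODE_KERNEL for the hypervisor, and the x86 Makefile change above keeps x86_32/supervisor_mode_kernel.o out of the link when the option is off. As the comment at the top of this file warns for such options, changing it requires a 'make clean' first.
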
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/boot/mkelf32.c
--- a/xen/arch/x86/boot/mkelf32.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/boot/mkelf32.c       Wed Mar  1 19:47:25 2006
@@ -244,7 +244,7 @@
 
     inimage  = argv[1];
     outimage = argv[2];
-    loadbase = strtoul(argv[3], NULL, 16);
+    loadbase = strtoull(argv[3], NULL, 16);
     final_exec_addr = strtoul(argv[4], NULL, 16);
 
     infd = open(inimage, O_RDONLY);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/dom0_ops.c   Wed Mar  1 19:47:25 2006
@@ -181,10 +181,13 @@
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
 
-        pi->threads_per_core = smp_num_siblings;
-        pi->cores_per_socket = boot_cpu_data.x86_max_cores;
+        pi->threads_per_core =
+            cpus_weight(cpu_sibling_map[0]);
+        pi->cores_per_socket =
+            cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
         pi->sockets_per_node = 
-            num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket);
+            num_online_cpus() / cpus_weight(cpu_core_map[0]);
+
         pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
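
The replacement arithmetic derives the topology from the CPU maps instead of trusting smp_num_siblings and x86_max_cores: cpus_weight() counts the set bits in a cpumask, so on a hyperthreaded two-socket, dual-core machine CPU 0's sibling map has weight 2 (threads_per_core = 2), its core map has weight 4 (cores_per_socket = 4/2 = 2), and sockets_per_node = 8 online CPUs / 4 = 2.
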
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/domain.c     Wed Mar  1 19:47:25 2006
@@ -351,17 +351,17 @@
 
     if ( !(c->flags & VGCF_HVM_GUEST) )
     {
-        fixup_guest_selector(c->user_regs.ss);
-        fixup_guest_selector(c->kernel_ss);
-        fixup_guest_selector(c->user_regs.cs);
+        fixup_guest_stack_selector(c->user_regs.ss);
+        fixup_guest_stack_selector(c->kernel_ss);
+        fixup_guest_code_selector(c->user_regs.cs);
 
 #ifdef __i386__
-        fixup_guest_selector(c->event_callback_cs);
-        fixup_guest_selector(c->failsafe_callback_cs);
+        fixup_guest_code_selector(c->event_callback_cs);
+        fixup_guest_code_selector(c->failsafe_callback_cs);
 #endif
 
         for ( i = 0; i < 256; i++ )
-            fixup_guest_selector(c->trap_ctxt[i].cs);
+            fixup_guest_code_selector(c->trap_ctxt[i].cs);
     }
     else if ( !hvm_enabled )
       return -EINVAL;
@@ -784,6 +784,11 @@
 
     context_saved(prev);
 
+    /* Update per-VCPU guest runstate shared memory area (if registered). */
+    if ( next->runstate_guest != NULL )
+        __copy_to_user(next->runstate_guest, &next->runstate,
+                       sizeof(next->runstate));
+
     schedule_tail(next);
     BUG();
 }
@@ -820,56 +825,77 @@
     flush_tlb_mask(v->vcpu_dirty_cpumask);
 }
 
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...)
+#define next_arg(fmt, args) ({                                              \
+    unsigned long __arg;                                                    \
+    switch ( *(fmt)++ )                                                     \
+    {                                                                       \
+    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
+    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
+    case 'p': __arg = (unsigned long)va_arg(args, void *);        break;    \
+    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
+    default:  __arg = 0; BUG();                                             \
+    }                                                                       \
+    __arg;                                                                  \
+})
+
+unsigned long hypercall_create_continuation(
+    unsigned int op, const char *format, ...)
 {
     struct mc_state *mcs = &mc_state[smp_processor_id()];
     struct cpu_user_regs *regs;
+    const char *p = format;
+    unsigned long arg;
     unsigned int i;
     va_list args;
 
-    va_start(args, nr_args);
+    va_start(args, format);
 
     if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
     {
         __set_bit(_MCSF_call_preempted, &mcs->flags);
 
-        for ( i = 0; i < nr_args; i++ )
-            mcs->call.args[i] = va_arg(args, unsigned long);
+        for ( i = 0; *p != '\0'; i++ )
+            mcs->call.args[i] = next_arg(p, args);
     }
     else
     {
         regs       = guest_cpu_user_regs();
 #if defined(__i386__)
         regs->eax  = op;
-        regs->eip -= 2;  /* re-execute 'int 0x82' */
-
-        for ( i = 0; i < nr_args; i++ )
-        {
+
+        if ( supervisor_mode_kernel )
+            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
+        else
+            regs->eip -= 2;   /* re-execute 'int 0x82' */
+
+        for ( i = 0; *p != '\0'; i++ )
+        {
+            arg = next_arg(p, args);
             switch ( i )
             {
-            case 0: regs->ebx = va_arg(args, unsigned long); break;
-            case 1: regs->ecx = va_arg(args, unsigned long); break;
-            case 2: regs->edx = va_arg(args, unsigned long); break;
-            case 3: regs->esi = va_arg(args, unsigned long); break;
-            case 4: regs->edi = va_arg(args, unsigned long); break;
-            case 5: regs->ebp = va_arg(args, unsigned long); break;
+            case 0: regs->ebx = arg; break;
+            case 1: regs->ecx = arg; break;
+            case 2: regs->edx = arg; break;
+            case 3: regs->esi = arg; break;
+            case 4: regs->edi = arg; break;
+            case 5: regs->ebp = arg; break;
             }
         }
 #elif defined(__x86_64__)
         regs->rax  = op;
         regs->rip -= 2;  /* re-execute 'syscall' */
 
-        for ( i = 0; i < nr_args; i++ )
-        {
+        for ( i = 0; *p != '\0'; i++ )
+        {
+            arg = next_arg(p, args);
             switch ( i )
             {
-            case 0: regs->rdi = va_arg(args, unsigned long); break;
-            case 1: regs->rsi = va_arg(args, unsigned long); break;
-            case 2: regs->rdx = va_arg(args, unsigned long); break;
-            case 3: regs->r10 = va_arg(args, unsigned long); break;
-            case 4: regs->r8  = va_arg(args, unsigned long); break;
-            case 5: regs->r9  = va_arg(args, unsigned long); break;
+            case 0: regs->rdi = arg; break;
+            case 1: regs->rsi = arg; break;
+            case 2: regs->rdx = arg; break;
+            case 3: regs->r10 = arg; break;
+            case 4: regs->r8  = arg; break;
+            case 5: regs->r9  = arg; break;
             }
         }
 #endif
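
hypercall_create_continuation() now takes a printf-style type string in place of the old bare argument count: per next_arg() above, 'i' fetches an unsigned int, 'l' an unsigned long, and 'p'/'h' a pointer, so 32-bit arguments are widened explicitly rather than all being pulled through va_arg() as unsigned long. A hypothetical call site, for illustration only (the surrounding function is invented, not part of this changeset):

    /* Sketch: a preemptible handler that has processed 'done' of 'todo'
     * items re-arms the hypercall so the guest re-enters it with its
     * progress preserved in the argument registers.  */
    static long process_batch(void *list, unsigned int todo,
                              unsigned int done)
    {
        while ( done < todo )
        {
            /* ... process item 'done' ... */
            done++;
            if ( hypercall_preempt_check() )
                return hypercall_create_continuation(
                    __HYPERVISOR_memory_op /* example opcode */, "pii",
                    list, todo, done);
        }
        return 0;
    }
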
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/domain_build.c       Wed Mar  1 19:47:25 2006
@@ -27,6 +27,9 @@
 #include <asm/shadow.h>
 
 #include <public/version.h>
+
+extern unsigned long initial_images_nrpages(void);
+extern void discard_initial_images(void);
 
 static long dom0_nrpages;
 
@@ -181,7 +184,8 @@
         {
             printk("Unknown kernel feature \"%.*s\".\n",
                    (int)(p-feats), feats);
-            panic("Domain 0 requires an unknown hypervisor feature.\n");
+            if ( req )
+                panic("Domain 0 requires an unknown hypervisor feature.\n");
         }
 
         feats = p;
@@ -248,9 +252,6 @@
     uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
     uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
 
-    extern void translate_l2pgtable(
-        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);
-
     /* Sanity! */
     BUG_ON(d->domain_id != 0);
     BUG_ON(d->vcpu[0] == NULL);
@@ -271,18 +272,14 @@
      */
     if ( dom0_nrpages == 0 )
     {
-        dom0_nrpages = avail_domheap_pages() +
-            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
-            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
+        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
         dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
         dom0_nrpages = -dom0_nrpages;
     }
 
     /* Negative memory specification means "all memory - specified amount". */
     if ( dom0_nrpages < 0 )
-        nr_pages = avail_domheap_pages() +
-            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
-            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
+        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
             dom0_nrpages;
     else
         nr_pages = dom0_nrpages;
@@ -704,16 +701,12 @@
         hypercall_page_initialise((void *)hypercall_page);
     }
 
-    init_domheap_pages(
-        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);
-
-    /* Copy the initial ramdisk and free temporary buffer. */
+    /* Copy the initial ramdisk. */
     if ( initrd_len != 0 )
-    {
         memcpy((void *)vinitrd_start, initrd_start, initrd_len);
-        init_domheap_pages(
-            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
-    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
 
     /* Set up start info area. */
     si = (start_info_t *)vstartinfo_start;
@@ -790,6 +783,25 @@
     {
         shadow_mode_enable(d, SHM_enable);
         update_pagetables(v);
+    }
+
+    if ( supervisor_mode_kernel )
+    {
+        v->arch.guest_context.kernel_ss &= ~3;
+        v->arch.guest_context.user_regs.ss &= ~3;
+        v->arch.guest_context.user_regs.es &= ~3;
+        v->arch.guest_context.user_regs.ds &= ~3;
+        v->arch.guest_context.user_regs.fs &= ~3;
+        v->arch.guest_context.user_regs.gs &= ~3;
+        printk("Dom0 runs in ring 0 (supervisor mode)\n");
+        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
+                       dom0_features_supported) )
+            panic("Dom0 does not support supervisor-mode execution\n");
+    }
+    else
+    {
+        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
+            panic("Dom0 requires supervisor-mode execution\n");
     }
 
     rc = 0;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/hvm.c    Wed Mar  1 19:47:25 2006
@@ -25,6 +25,7 @@
 #include <xen/sched.h>
 #include <xen/irq.h>
 #include <xen/softirq.h>
+#include <xen/domain.h>
 #include <xen/domain_page.h>
 #include <asm/current.h>
 #include <asm/io.h>
@@ -59,9 +60,9 @@
 
     for ( i = 0; i < nr_pfn; i++ )
     {
-        if ( pfn + i >= 0xfffff ) 
+        if ( pfn + i >= 0xfffff )
             break;
-        
+
         __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
     }
 }
@@ -217,7 +218,7 @@
     global_iodata_t *spg;
     u16   *virq_line, irqs;
     struct hvm_virpic *pic = &v->domain->arch.hvm_domain.vpic;
-    
+
     spg = &get_sp(v->domain)->sp_global;
     virq_line  = &spg->pic_clear_irr;
     if ( *virq_line ) {
@@ -312,6 +313,52 @@
 }
 
 /*
+ * only called in HVM domain BSP context
+ * when booting, vcpuid is always equal to apic_id
+ */
+int hvm_bringup_ap(int vcpuid, int trampoline_vector)
+{
+    struct vcpu *bsp = current, *v;
+    struct domain *d = bsp->domain;
+    struct vcpu_guest_context *ctxt;
+    int rc = 0;
+
+    /* current must be HVM domain BSP */
+    if ( !(HVM_DOMAIN(bsp) && bsp->vcpu_id == 0) ) {
+        printk("Not calling hvm_bringup_ap from BSP context.\n");
+        domain_crash_synchronous();
+    }
+
+    if ( (v = d->vcpu[vcpuid]) == NULL )
+        return -ENOENT;
+
+    if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) {
+        printk("Failed to allocate memory in hvm_bringup_ap.\n");
+        return -ENOMEM;
+    }
+
+    hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
+
+    LOCK_BIGLOCK(d);
+    rc = -EEXIST;
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+        rc = boot_vcpu(d, vcpuid, ctxt);
+    UNLOCK_BIGLOCK(d);
+
+    if ( rc != 0 )
+        printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
+    else {
+        if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
+            vcpu_wake(d->vcpu[vcpuid]);
+        printk("AP %d bringup suceeded.\n", vcpuid);
+    }
+
+    xfree(ctxt);
+
+    return rc;
+}
+
+/*
  * Local variables:
  * mode: C
  * c-set-style: "BSD"
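
The bring-up path added above runs on the BSP when the guest starts a secondary processor: hvm_init_ap_context() prepares a fresh boot-time register state aimed at the given trampoline vector, boot_vcpu() installs it under the domain's big lock (returning -EEXIST if that VCPU was already initialised), and clearing _VCPUF_down then lets the scheduler run the new VCPU. The comment above notes that at boot time vcpuid and APIC ID coincide, which is what lets the caller index d->vcpu[] directly.
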
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c    Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/emulate.c    Wed Mar  1 19:47:25 2006
@@ -86,7 +86,7 @@
     case 0x7:
         value = regs->edi;
         break;
-#if X86_64
+#if __x86_64__
     case 0x8:
         value = regs->r8;
         break;
@@ -318,20 +318,14 @@
 
 
 /* Get the register/mode number of src register in ModRM register. */
-unsigned int decode_dest_reg(u8 m)
-{
-#if __x86_64__
-    ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
-    return (m >> 3) & 7;
-}
-
-unsigned int decode_src_reg(u8 m)
-{
-#if __x86_64__
-    ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
-    return m & 7;
+unsigned int decode_dest_reg(u8 prefix, u8 m)
+{
+    return DECODE_MODRM_REG(prefix, m);
+}
+
+unsigned int decode_src_reg(u8 prefix, u8 m)
+{
+    return DECODE_MODRM_RM(prefix, m);
 }
 
 
@@ -431,7 +425,7 @@
  * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer
  * to enough bytes to satisfy the instruction including prefix bytes.
  */
-unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
         enum instruction_index *list, unsigned int list_count, 
         u8 *guest_eip_buf, enum instruction_index *match)
 {
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/intr.c
--- a/xen/arch/x86/hvm/svm/intr.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/intr.c       Wed Mar  1 19:47:25 2006
@@ -80,12 +80,7 @@
 {
     struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
 
-    switch(type)
-    {
-    case VLAPIC_DELIV_MODE_EXT:
-    case VLAPIC_DELIV_MODE_FIXED:
-    case VLAPIC_DELIV_MODE_LPRI:
-        if ( is_pit_irq(v, vector, type) ) {
+    if ( is_pit_irq(v, vector, type) ) {
             if ( !vpit->first_injected ) {
                 vpit->first_injected = 1;
                 vpit->pending_intr_nr = 0;
@@ -95,12 +90,15 @@
             }
             vpit->inject_point = NOW();
             svm_set_tsc_shift (v, vpit);
-        }
+    }
+
+    switch(type)
+    {
+    case VLAPIC_DELIV_MODE_EXT:
         break;
 
     default:
-        printk("Not support interrupt type: %d\n", type);
-        break;
+        vlapic_post_injection(v, vector, type);
     }
 }
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Mar  1 19:47:25 2006
@@ -164,7 +164,7 @@
 }
 
 static inline void svm_inject_exception(struct vmcb_struct *vmcb, 
-                                        int trap, int error_code)
+                                        int trap, int ev, int error_code)
 {
     eventinj_t event;
 
@@ -172,7 +172,7 @@
     event.fields.v = 1;
     event.fields.type = EVENTTYPE_EXCEPTION;
     event.fields.vector = trap;
-    event.fields.ev = 1;
+    event.fields.ev = ev;
     event.fields.errorcode = error_code;
 
     ASSERT(vmcb->eventinj.fields.v == 0);
@@ -237,61 +237,16 @@
 }
 
 #ifdef __x86_64__
-static struct svm_msr_state percpu_msr[NR_CPUS];
-
-static u32 msr_data_index[VMX_MSR_COUNT] =
-{
-    MSR_LSTAR, MSR_STAR, MSR_CSTAR,
-    MSR_SYSCALL_MASK, MSR_EFER,
-};
 
 void svm_save_segments(struct vcpu *v)
 {
-    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_svm.msr_content.shadow_gs);
-}
-
-/*
- * To avoid MSR save/restore at every VM exit/entry time, we restore
- * the x86_64 specific MSRs at domain switch time. Since those MSRs are
- * are not modified once set for generic domains, we don't save them,
- * but simply reset them to the values set at percpu_traps_init().
- */
+}
 void svm_load_msrs(void)
 {
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
-    int i;
-
-    while ( host_state->flags )
-    {
-        i = find_first_set_bit(host_state->flags);
-        wrmsrl(msr_data_index[i], host_state->msr_items[i]);
-        clear_bit(i, &host_state->flags);
-    }
-}
-
-static void svm_save_init_msrs(void)
-{
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
-    int i;
-
-    for ( i = 0; i < SVM_MSR_COUNT; i++ )
-        rdmsrl(msr_data_index[i], host_state->msr_items[i]);
-}
-
-#define CASE_READ_MSR(address)                               \
-    case MSR_ ## address:                                    \
-    msr_content = msr->msr_items[SVM_INDEX_MSR_ ## address]; \
-    break
-
-#define CASE_WRITE_MSR(address)                              \
-    case MSR_ ## address:                                    \
-    msr->msr_items[SVM_INDEX_MSR_ ## address] = msr_content; \
-    if (!test_bit(SVM_INDEX_MSR_ ## address, &msr->flags))   \
-    {                                                        \
-        set_bit(SVM_INDEX_MSR_ ## address, &msr->flags);     \
-    }                                                        \
-    break
-
+}
+void svm_restore_msrs(struct vcpu *v)
+{
+}
 
 #define IS_CANO_ADDRESS(add) 1
 
@@ -299,47 +254,45 @@
 {
     u64 msr_content = 0;
     struct vcpu *vc = current;
-    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
+    //    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
     struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
 
     switch (regs->ecx)
     {
     case MSR_EFER:
-        msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
-        HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", 
-                (unsigned long long)msr_content);
-
-        if (test_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state))
-            msr_content |= 1 << _EFER_LME;
-
-        if (SVM_LONG_GUEST(vc))
-            msr_content |= 1 << _EFER_LMA;
-
+        // msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
+        msr_content = vmcb->efer;      
+        msr_content &= ~EFER_SVME;
         break;
 
     case MSR_FS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            /* XXX should it be GP fault */
-            domain_crash_synchronous();
-        
         msr_content = vmcb->fs.base;
         break;
 
     case MSR_GS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            domain_crash_synchronous();
-
         msr_content = vmcb->gs.base;
         break;
 
     case MSR_SHADOW_GS_BASE:
-        msr_content = msr->shadow_gs;
-        break;
-
-    CASE_READ_MSR(STAR);
-    CASE_READ_MSR(LSTAR);
-    CASE_READ_MSR(CSTAR);
-    CASE_READ_MSR(SYSCALL_MASK);
+        msr_content = vmcb->kerngsbase;
+        break;
+
+    case MSR_STAR:
+         msr_content = vmcb->star;
+         break;
+ 
+    case MSR_LSTAR:
+         msr_content = vmcb->lstar;
+         break;
+ 
+    case MSR_CSTAR:
+         msr_content = vmcb->cstar;
+         break;
+ 
+    case MSR_SYSCALL_MASK:
+         msr_content = vmcb->sfmask;
+         break;
+
     default:
         return 0;
     }
@@ -356,8 +309,6 @@
 {
     u64 msr_content = regs->eax | ((u64)regs->edx << 32); 
     struct vcpu *vc = current;
-    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
     struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
 
     HVM_DBG_LOG(DBG_LEVEL_1, "mode_do_msr_write msr %lx msr_content %lx\n", 
@@ -373,26 +324,20 @@
                     || !test_bit(SVM_CPU_STATE_PAE_ENABLED,
                                  &vc->arch.hvm_svm.cpu_state))
             {
-                svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
             }
         }
 
         if (msr_content & EFER_LME)
             set_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state);
 
+        /* We have already recorded that we want LME, so it will be set 
+         * next time CR0 gets updated. So we clear that bit and continue.
+         */
+        if ((msr_content ^ vmcb->efer) & EFER_LME)
+            msr_content &= ~EFER_LME;  
         /* No update for LME/LMA since they have no effect */
-        msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
-        if (msr_content & ~(EFER_LME | EFER_LMA))
-        {
-            msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
-            if (!test_bit(SVM_INDEX_MSR_EFER, &msr->flags))
-            { 
-                rdmsrl(MSR_EFER, host_state->msr_items[SVM_INDEX_MSR_EFER]);
-                set_bit(SVM_INDEX_MSR_EFER, &host_state->flags);
-                set_bit(SVM_INDEX_MSR_EFER, &msr->flags);  
-                wrmsrl(MSR_EFER, msr_content);
-            }
-        }
+        vmcb->efer = msr_content | EFER_SVME;
         break;
 
     case MSR_FS_BASE:
@@ -403,63 +348,42 @@
         if (!IS_CANO_ADDRESS(msr_content))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
         }
 
         if (regs->ecx == MSR_FS_BASE)
-           vmcb->fs.base = msr_content;
+            vmcb->fs.base = msr_content;
         else 
-           vmcb->gs.base = msr_content;
+            vmcb->gs.base = msr_content;
         break;
 
     case MSR_SHADOW_GS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            domain_crash_synchronous();
-
-        vc->arch.hvm_svm.msr_content.shadow_gs = msr_content;
-        wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
-        break;
-
-    CASE_WRITE_MSR(STAR);
-    CASE_WRITE_MSR(LSTAR);
-    CASE_WRITE_MSR(CSTAR);
-    CASE_WRITE_MSR(SYSCALL_MASK);
+         vmcb->kerngsbase = msr_content;
+         break;
+ 
+    case MSR_STAR:
+         vmcb->star = msr_content;
+         break;
+ 
+    case MSR_LSTAR:
+         vmcb->lstar = msr_content;
+         break;
+ 
+    case MSR_CSTAR:
+         vmcb->cstar = msr_content;
+         break;
+ 
+    case MSR_SYSCALL_MASK:
+         vmcb->sfmask = msr_content;
+         break;
+
     default:
         return 0;
     }
     return 1;
 }
 
-void
-svm_restore_msrs(struct vcpu *v)
-{
-    int i = 0;
-    struct svm_msr_state *guest_state;
-    struct svm_msr_state *host_state;
-    unsigned long guest_flags;
-
-    guest_state = &v->arch.hvm_svm.msr_content;;
-    host_state = &percpu_msr[smp_processor_id()];
-
-    wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
-    guest_flags = guest_state->flags;
-    if (!guest_flags)
-        return;
-
-    while (guest_flags){
-        i = find_first_set_bit(guest_flags);
-
-        HVM_DBG_LOG(DBG_LEVEL_2,
-                    "restore guest's index %d msr %lx with %lx\n",
-                    i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
-        set_bit(i, &host_state->flags);
-        wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
-        clear_bit(i, &guest_flags);
-    }
-}
 #else
-#define        svm_save_init_msrs()    ((void)0)
-
 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
 {
     return 0;
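
The large deletion above works because, unlike VMX, SVM keeps the long-mode syscall MSRs (STAR/LSTAR/CSTAR/SFMASK), the kernel GS base and EFER inside the VMCB itself, where VMRUN and VMSAVE/VMLOAD context-switch them; the per-cpu shadow arrays and the CASE_READ_MSR/CASE_WRITE_MSR machinery are therefore dead weight. A minimal sketch of the resulting read path, assuming the VMCB field names used in this changeset:

    static int long_mode_msr_read_sketch(struct vmcb_struct *vmcb,
                                         u32 msr, u64 *val)
    {
        switch ( msr )
        {
        case MSR_EFER:           *val = vmcb->efer & ~EFER_SVME; break;
        case MSR_STAR:           *val = vmcb->star;              break;
        case MSR_LSTAR:          *val = vmcb->lstar;             break;
        case MSR_CSTAR:          *val = vmcb->cstar;             break;
        case MSR_SYSCALL_MASK:   *val = vmcb->sfmask;            break;
        case MSR_SHADOW_GS_BASE: *val = vmcb->kerngsbase;        break;
        default:                 return 0;
        }
        return 1;   /* EFER_SVME is masked out so the guest never sees it */
    }
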
@@ -497,9 +421,28 @@
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
-
-    mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
+    /* check which operating mode the guest is running */
+    if( vmcb->efer & EFER_LMA )
+        mode = vmcb->cs.attributes.fields.l ? 8 : 4;
+    else
+        mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
     return svm_instrlen(guest_cpu_user_regs(), mode);
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+{
+    switch ( num )
+    {
+    case 0:
+        return v->arch.hvm_svm.cpu_shadow_cr0;
+    case 2:
+        return v->arch.hvm_svm.cpu_cr2;
+    case 3:
+        return v->arch.hvm_svm.cpu_cr3;
+    default:
+        BUG();
+    }
+    return 0;                   /* dummy */
 }
 
 int start_svm(void)
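
In the svm_instruction_length() hunk above, the operand width handed to the decoder is now derived from the guest's true execution mode: 64-bit (EFER.LMA set and CS.L set), compatibility or legacy protected mode, or real/VM86 mode. Stated as a stand-alone helper, under the same field names:

    static unsigned int guest_mode_width_sketch(struct vmcb_struct *vmcb)
    {
        if ( vmcb->efer & EFER_LMA )
            return vmcb->cs.attributes.fields.l ? 8 : 4;  /* 64-bit vs compat */
        if ( (vmcb->rflags & X86_EFLAGS_VM) || !(vmcb->cr0 & X86_CR0_PE) )
            return 2;                                     /* real mode / VM86 */
        return 4;                                         /* protected mode   */
    }
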
@@ -519,8 +462,6 @@
     asidpool_init(smp_processor_id());    
     printk("AMD SVM Extension is enabled for cpu %d.\n", smp_processor_id());
     
-    svm_save_init_msrs();
-
     /* Setup HVM interfaces */
     hvm_funcs.disable = stop_svm;
 
@@ -542,6 +483,7 @@
     hvm_funcs.realmode = svm_realmode;
     hvm_funcs.paging_enabled = svm_paging_enabled;
     hvm_funcs.instruction_length = svm_instruction_length;
+    hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 
     hvm_enabled = 1;    
 
@@ -631,8 +573,17 @@
 }
 
 #if defined (__x86_64__)
-void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *c )
-{
+void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v )
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    regs->rip    = vmcb->rip;
+    regs->rsp    = vmcb->rsp;
+    regs->rflags = vmcb->rflags;
+    regs->cs     = vmcb->cs.sel;
+    regs->ds     = vmcb->ds.sel;
+    regs->es     = vmcb->es.sel;
+    regs->ss     = vmcb->ss.sel;
 }
 #elif defined (__i386__)
 void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v)
@@ -810,7 +761,8 @@
     vpit = &v->domain->arch.hvm_domain.vpit;
     kill_timer(&vpit->pit_timer);
     kill_timer(&v->arch.hvm_svm.hlt_timer);
-    if ( hvm_apic_support(v->domain) ) {
+    if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
+    {
         kill_timer( &(VLAPIC(v)->vlapic_timer) );
         xfree( VLAPIC(v) );
     }
@@ -819,8 +771,29 @@
 
 void arch_svm_do_resume(struct vcpu *v) 
 {
-    svm_do_resume(v);
-    reset_stack_and_jump(svm_asm_do_resume);
+    /* pinning VCPU to a different core? */
+    if ( v->arch.hvm_svm.launch_core == smp_processor_id()) {
+        svm_do_resume( v );
+        reset_stack_and_jump( svm_asm_do_resume );
+    }
+    else {
+        printk("VCPU core pinned: %d to %d\n", v->arch.hvm_svm.launch_core, smp_processor_id() );
+        v->arch.hvm_svm.launch_core = smp_processor_id();
+        svm_migrate_timers( v );
+        svm_do_resume( v );
+        reset_stack_and_jump( svm_asm_do_resume );
+    }
+}
+
+
+void svm_migrate_timers(struct vcpu *v)
+{
+    struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
+
+    migrate_timer( &vpit->pit_timer, v->processor );
+    migrate_timer( &v->arch.hvm_svm.hlt_timer, v->processor );
+    if ( hvm_apic_support(v->domain) && VLAPIC( v ))
+        migrate_timer( &(VLAPIC(v)->vlapic_timer ), v->processor );
 }
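
Per-VCPU timers (the PIT tick, the HLT wakeup and the vLAPIC timer) are armed on a particular physical CPU, so when the scheduler lands a VCPU on a different core they must be re-armed there; arch_svm_do_resume() now detects the move via launch_core and calls svm_migrate_timers() before resuming. In use, the pattern reduces to:

    if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
    {
        v->arch.hvm_svm.launch_core = smp_processor_id();
        svm_migrate_timers(v);    /* re-arm PIT/HLT/vLAPIC timers locally */
    }
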
 
 
@@ -860,9 +833,9 @@
        /* No support for APIC */
         if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
         { 
-            unsigned long inst_len;
-           inst_len = svm_instruction_length(v);
-            if (inst_len == (unsigned long)-1)
+            int inst_len;
+            inst_len = svm_instruction_length(v);
+            if (inst_len == -1)
             {
                 printf("%s: INST_LEN - Unable to decode properly.\n", 
__func__);
                 domain_crash_synchronous();
@@ -914,6 +887,14 @@
 
     eip = vmcb->rip;
     error_code = vmcb->exitinfo1;
+
+    if (vmcb->idtr.limit == 0) {
+        printf("Huh? We got a GP Fault with an invalid IDTR!\n");
+        svm_dump_vmcb(__func__, vmcb);
+        svm_dump_regs(__func__, regs);
+        svm_dump_inst(vmcb->rip); 
+        __hvm_bug(regs);
+    }
 
     HVM_DBG_LOG(DBG_LEVEL_1,
                 "svm_general_protection_fault: eip = %lx, erro_code = %lx",
@@ -927,7 +908,7 @@
 
     
     /* Reflect it back into the guest */
-    svm_inject_exception(vmcb, TRAP_gp_fault, error_code);
+    svm_inject_exception(vmcb, TRAP_gp_fault, 1, error_code);
 }
 
 /* Reserved bits: [31:14], [12:1] */
@@ -939,7 +920,7 @@
     unsigned int eax, ebx, ecx, edx;
     unsigned long eip;
     struct vcpu *v = current;
-    unsigned int inst_len;
+    int inst_len;
 
     ASSERT(vmcb);
 
@@ -956,21 +937,29 @@
 
     if (input == 1)
     {
+#ifndef __x86_64__
         if ( hvm_apic_support(v->domain) &&
                 !vlapic_global_enabled((VLAPIC(v))) )
+#endif
             clear_bit(X86_FEATURE_APIC, &edx);
            
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS < 3
+        clear_bit(X86_FEATURE_PAE, &edx);
+        clear_bit(X86_FEATURE_PSE, &edx);
+        clear_bit(X86_FEATURE_PSE36, &edx);
+#else
         if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-#endif
         {
+            if ( !v->domain->arch.hvm_domain.pae_enabled )
+                clear_bit(X86_FEATURE_PAE, &edx);
             clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PAE, &edx);
             clear_bit(X86_FEATURE_PSE36, &edx);
         }
+#endif
        
         /* Clear out reserved bits. */
         ecx &= ~SVM_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
+        clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
     }
 #ifdef __i386__
     else if ( input == 0x80000001 )
@@ -991,6 +980,7 @@
             eip, input, eax, ebx, ecx, edx);
 
     inst_len = __get_instruction_length(vmcb, INSTR_CPUID, NULL);
+    ASSERT(inst_len > 0);
     __update_guest_eip(vmcb, inst_len);
 }
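
Two details in the CPUID hunk are easy to miss: PAE is now only advertised when the domain is actually configured for it (pae_enabled), and Xen's X86_FEATURE_* constants encode a word index as well as a bit number, so clearing a feature inside one 32-bit register needs the "& 31" mask. A sketch of the latter, assuming the conventional (word*32 + bit) encoding of these constants:

    /* X86_FEATURE_MWAIT lives in the CPUID.1:ECX word; "& 31" strips the
       word index and leaves the bit position within that register. */
    static void mask_mwait_sketch(unsigned int *ecx)
    {
        clear_bit(X86_FEATURE_MWAIT & 31, ecx);
    }
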
 
@@ -1083,9 +1073,11 @@
     unsigned long *reg_p = 0;
     unsigned int gpreg = 0;
     unsigned long eip;
-    unsigned int inst_len; 
+    int inst_len; 
+    int index;
     struct vmcb_struct *vmcb;
     u8 buffer[MAX_INST_LEN];
+    u8 prefix = 0;
 
     vmcb = v->arch.hvm_svm.vmcb;
     
@@ -1093,13 +1085,15 @@
 
     eip = vmcb->rip;
     inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
-
-    ASSERT(buffer[0] == 0x0f && (buffer[1] & 0xFD) == 0x21);
-
-    gpreg = decode_src_reg(buffer[2]);
-#if DEBUG
-    ASSERT(reg == decode_dest_reg(buffer[2]));
-#endif
+    index = skip_prefix_bytes(buffer, sizeof(buffer));
+    
+    ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
+
+    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+        prefix = buffer[index-1];
+
+    gpreg = decode_src_reg(prefix, buffer[index + 2]);
+    ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
 
     HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
             eip, reg, gpreg);
@@ -1120,6 +1114,7 @@
         __hvm_bug(regs);
         break;
     }
+    ASSERT(inst_len > 0);
     __update_guest_eip(vmcb, inst_len);
 }
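
The MOV-DRn decoder can no longer assume the opcode starts at buffer[0]: in long mode the instruction may carry prefixes, and a trailing REX prefix (0x40-0x4F) widens the register fields, which is why decode_src_reg()/decode_dest_reg() now take the prefix byte as well. A plausible shape for the skip_prefix_bytes() helper the patch relies on (the real definition lives elsewhere in this changeset):

    static int skip_prefix_bytes_sketch(u8 *buf, size_t size)
    {
        int index = 0;
        while ( index < (int)size && is_prefix(buf[index]) )
            index++;
        return index;   /* first byte that is not a legacy/REX prefix */
    }
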
 
@@ -1335,13 +1330,13 @@
     }
 }
 
-
 static int svm_set_cr0(unsigned long value)
 {
     struct vcpu *v = current;
     unsigned long mfn;
     int paging_enabled;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long crn;
 
     ASSERT(vmcb);
 
@@ -1377,7 +1372,7 @@
                     &v->arch.hvm_svm.cpu_state))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
         }
 
         if (test_bit(SVM_CPU_STATE_LME_ENABLED, &v->arch.hvm_svm.cpu_state))
@@ -1386,14 +1381,7 @@
             HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
             set_bit(SVM_CPU_STATE_LMA_ENABLED,
                     &v->arch.hvm_svm.cpu_state);
-#if 0
-            __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
-            vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
-            __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-#else
-           printk("Cannot yet set SVM_CPU_STATE_LMA_ENABLED\n");
-           domain_crash_synchronous();
-#endif
+            vmcb->efer |= (EFER_LMA | EFER_LME);
 
 #if CONFIG_PAGING_LEVELS >= 4 
             if (!shadow_set_guest_paging_levels(v->domain, 4)) 
@@ -1404,8 +1392,9 @@
 #endif
         }
         else
+#endif  /* __x86_64__ */
         {
-#if CONFIG_PAGING_LEVELS >= 4
+#if CONFIG_PAGING_LEVELS >= 3
             if (!shadow_set_guest_paging_levels(v->domain, 2))
             {
                 printk("Unsupported guest paging levels\n");
@@ -1414,33 +1403,18 @@
 #endif
         }
 
-#if 0
-        unsigned long crn;
-
         /* update CR4's PAE if needed */
-        __vmread(GUEST_CR4, &crn);
+        crn = vmcb->cr4;
         if ((!(crn & X86_CR4_PAE)) 
                 && test_bit(SVM_CPU_STATE_PAE_ENABLED, 
                     &v->arch.hvm_svm.cpu_state))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
-            __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
-        }
-#else
-       printk("Cannot yet set SVM_CPU_STATE_PAE_ENABLED\n");
-       domain_crash_synchronous(); 
-#endif
-#elif defined(__i386__)
-       {
-            unsigned long old_base_mfn;
-            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            if (old_base_mfn)
-                put_page(mfn_to_page(old_base_mfn));
-       }
-#endif
+            vmcb->cr4 |= X86_CR4_PAE;
+        }
 
         /* Now arch.guest_table points to machine physical. */
-        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+        v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
         update_pagetables(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
@@ -1461,7 +1435,7 @@
      */
     if ((value & X86_CR0_PE) == 0) {
        if (value & X86_CR0_PG) {
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
             return 0;
         }
 
@@ -1471,7 +1445,6 @@
 
     return 1;
 }
-
 
 /*
  * Read from control registers. CR0 and CR4 are read from the shadow.
@@ -1497,7 +1470,7 @@
         value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
         break;
     case 4:
-        value = vmcb->cr4;
+        value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
         break;
     case 8:
 #if 0
@@ -1579,7 +1552,7 @@
             }
 
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+            v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
 
             if (old_base_mfn)
                 put_page(mfn_to_page(old_base_mfn));
@@ -1596,12 +1569,19 @@
 
     case 4:         
         /* CR4 */
-        if (value & X86_CR4_PAE)
-            __hvm_bug(regs);    /* not implemented */
-
-        old_cr = vmcb->cr4;
-        
-        vmcb->cr4 = value;
+        if (value & X86_CR4_PAE) {
+            set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+        } else {
+            if (test_bit(SVM_CPU_STATE_LMA_ENABLED,
+                         &v->arch.hvm_svm.cpu_state)) {
+                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
+            }
+            clear_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+        }
+
+        old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
+        v->arch.hvm_svm.cpu_shadow_cr4 = value;
+        vmcb->cr4 = value | SVM_CR4_HOST_MASK;
   
         /*
          * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
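
CR4 now gets the same shadow treatment CR0 already had: the value the guest wrote is kept in cpu_shadow_cr4 and returned on reads, while the real vmcb->cr4 always carries SVM_CR4_HOST_MASK so the bits shadow paging depends on stay set. In outline:

    static void svm_write_cr4_sketch(struct vcpu *v, unsigned long value)
    {
        struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
        v->arch.hvm_svm.cpu_shadow_cr4 = value;        /* guest's view   */
        vmcb->cr4 = value | SVM_CR4_HOST_MASK;         /* hardware value */
    }

    static unsigned long svm_read_cr4_sketch(struct vcpu *v)
    {
        return v->arch.hvm_svm.cpu_shadow_cr4;         /* hide host bits */
    }
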
@@ -1630,10 +1610,12 @@
         struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int inst_len = 0;
+    int inst_len = 0;
+    int index;
     unsigned int gpreg;
     unsigned long value;
-    u8 buffer[6];   
+    u8 buffer[MAX_INST_LEN];   
+    u8 prefix = 0;
     int result = 1;
     enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
     enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
@@ -1642,29 +1624,41 @@
     ASSERT(vmcb);
 
     inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
+    /* get index to first actual instruction byte - as we will need to know where the
+     * prefix lives later on
+     */
+    index = skip_prefix_bytes(buffer, sizeof(buffer));
     
     if (type == TYPE_MOV_TO_CR) 
     {
         inst_len = __get_instruction_length_from_list(vmcb, list_a, 
-                ARR_SIZE(list_a), buffer, &match);
+                ARR_SIZE(list_a), &buffer[index], &match);
     }
     else
     {
         inst_len = __get_instruction_length_from_list(vmcb, list_b, 
-                ARR_SIZE(list_b), buffer, &match);
-    }
+                ARR_SIZE(list_b), &buffer[index], &match);
+    }
+
+    ASSERT(inst_len > 0);
+
+    inst_len += index;
+
+    /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
+    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+        prefix = buffer[index-1];
 
     HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
 
     switch (match) 
     {
     case INSTR_MOV2CR:
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         result = mov_to_cr(gpreg, cr, regs);
         break;
 
     case INSTR_MOVCR2:
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         mov_from_cr(cr, gpreg, regs);
         break;
 
@@ -1680,7 +1674,7 @@
         if (svm_dbg_on)
             svm_dump_inst(svm_rip2pointer(vmcb));
         
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         value = get_reg(gpreg, regs, vmcb) & 0xF;
 
         if (svm_dbg_on)
@@ -1698,7 +1692,7 @@
     case INSTR_SMSW:
         svm_dump_inst(svm_rip2pointer(vmcb));
         value = v->arch.hvm_svm.cpu_shadow_cr0;
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         set_reg(gpreg, value, regs, vmcb);
 
         if (svm_dbg_on)
@@ -1721,7 +1715,7 @@
 static inline void svm_do_msr_access(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int  inst_len;
+    int  inst_len;
     int64_t tsc_sum;
 
     ASSERT(vmcb);
@@ -1813,7 +1807,9 @@
         next_wakeup = next_pit;
     if ( next_wakeup != - 1 )
         set_timer(&current->arch.hvm_svm.hlt_timer, next_wakeup);
+/* temporary workaround for 8828/8822 evtchn patches causing SVM failure.
     hvm_safe_block();
+*/
 }
 
 
@@ -1860,7 +1856,7 @@
     struct vcpu *v = current;
     u8 opcode[MAX_INST_SIZE], prefix, length = MAX_INST_SIZE;
     unsigned long g_vaddr;
-    unsigned int inst_len;
+    int inst_len;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
     ASSERT(vmcb);
@@ -1877,6 +1873,7 @@
     if (invlpga)
     {
         inst_len = __get_instruction_length(vmcb, INSTR_INVLPGA, opcode);
+        ASSERT(inst_len > 0);
         __update_guest_eip(vmcb, inst_len);
 
         /* 
@@ -1890,6 +1887,7 @@
         /* What about multiple prefix codes? */
         prefix = (is_prefix(opcode[0])?opcode[0]:0);
         inst_len = __get_instruction_length(vmcb, INSTR_INVLPG, opcode);
+        ASSERT(inst_len > 0);
 
         inst_len--;
         length -= inst_len;
@@ -1941,7 +1939,10 @@
     v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
 
     vmcb->cr2 = 0;
-    vmcb->cr4 = 0;
+    vmcb->efer = EFER_SVME;
+
+    vmcb->cr4 = SVM_CR4_HOST_MASK;
+    v->arch.hvm_svm.cpu_shadow_cr4 = 0;
 
     /* This will jump to ROMBIOS */
     vmcb->rip = 0xFFF0;
@@ -2011,12 +2012,13 @@
 static int svm_do_vmmcall(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int inst_len;
+    int inst_len;
 
     ASSERT(vmcb);
     ASSERT(regs);
 
     inst_len = __get_instruction_length(vmcb, INSTR_VMCALL, NULL);
+    ASSERT(inst_len > 0);
 
     /* VMMCALL sanity check */
     if (vmcb->cpl > get_vmmcall_cpl(regs->edi))
@@ -2470,7 +2472,7 @@
         {
             v->arch.hvm_svm.injecting_event = 1;
             /* Inject #PG using Interruption-Information Fields */
-            svm_inject_exception(vmcb, TRAP_page_fault, regs.error_code);
+            svm_inject_exception(vmcb, TRAP_page_fault, 1, regs.error_code);
 
             v->arch.hvm_svm.cpu_cr2 = va;
             vmcb->cr2 = va;
@@ -2665,26 +2667,23 @@
 {
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    int core = smp_processor_id();
-    int oldcore = v->arch.hvm_svm.core; 
-    /* 
-     * if need to assign new asid or if switching cores, 
-     * then retire asid for old core, and assign new for new core.
-     */
-    if( v->arch.hvm_svm.core != core ) {
-        if (svm_dbg_on)
-            printk("old core %d new core %d\n",(int)v->arch.hvm_svm.core,(int)core);
-        v->arch.hvm_svm.core = core;
-    }
-    if( test_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags) ||
-          (oldcore != core)) {
-        if(!asidpool_assign_next(vmcb, 1, 
-                   oldcore, core)) {
+
+   /*
+    * if need to assign new asid, or if switching cores,
+    * retire asid for the old core, and assign a new asid to the current core.
+    */
+    if ( test_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ) ||
+       ( v->arch.hvm_svm.asid_core != v->arch.hvm_svm.launch_core )) {
+        /* recycle asid */
+        if ( !asidpool_assign_next( vmcb, 1,
+            v->arch.hvm_svm.asid_core, v->arch.hvm_svm.launch_core )) {
             /* If we get here, we have a major problem */
             domain_crash_synchronous();
         }
-    }
-    clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
+
+        v->arch.hvm_svm.asid_core = v->arch.hvm_svm.launch_core;
+        clear_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags );
+    }
 }
 
 /*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar  1 19:47:25 2006
@@ -190,7 +190,6 @@
     unsigned long eflags;
     unsigned long shadow_cr;
     struct vmcb_struct *vmcb = arch_svm->vmcb;
-    struct Xgt_desc_struct desc;
 
     /* Allows IRQs to be shared */
     vmcb->vintr.fields.intr_masking = 1;
@@ -224,9 +223,9 @@
     vmcb->fs.base = 0;
     vmcb->gs.base = 0;
 
-    __asm__ __volatile__ ("sidt  (%0) \n" :: "a"(&desc) : "memory");
-    vmcb->idtr.base = desc.address;
-    vmcb->idtr.limit = desc.size;
+    /* Guest Interrupt descriptor table */
+    vmcb->idtr.base = 0;
+    vmcb->idtr.limit = 0;
 
     /* Set up segment attributes */
     attrib.bytes = 0;
@@ -248,15 +247,11 @@
     attrib.fields.type = 0xb;   /* type=0xb -> executable/readable, accessed */
     vmcb->cs.attributes = attrib;
 
-    /* Global descriptor table */
-    //NMERGE7500 - can probably remove access to gdtr
-    vmcb->gdtr.base = regs->edx;
-    regs->edx = 0;
-    ASSERT(regs->eax <= 0xFFFF); /* Make sure we're in the limit */
-    vmcb->gdtr.limit = regs->eax;
-    regs->eax = 0;
-
-    /* Local Descriptor Table */
+    /* Guest Global descriptor table */
+    vmcb->gdtr.base = 0;
+    vmcb->gdtr.limit = 0;
+
+    /* Guest Local Descriptor Table */
     attrib.fields.s = 0; /* not code or data segment */
     attrib.fields.type = 0x2; /* LDT */
     attrib.fields.db = 0; /* 16-bit */
@@ -279,11 +274,10 @@
     /* CR3 is set in svm_final_setup_guest */
 
     __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) :); 
-    shadow_cr = crn;
-    vmcb->cr4 = shadow_cr;
-
-//MERGE7500 - should write a 0 instead to rsp?
-    vmcb->rsp = regs->esp;
+    arch_svm->cpu_shadow_cr4 = crn & ~(X86_CR4_PGE | X86_CR4_PSE);
+    vmcb->cr4 = crn | SVM_CR4_HOST_MASK;
+
+    vmcb->rsp = 0;
     vmcb->rip = regs->eip;
 
     eflags = regs->eflags & ~HVM_EFLAGS_RESERVED_0; /* clear 0s */
@@ -306,7 +300,7 @@
 {
     if(arch_svm->vmcb != NULL)
     {
-        asidpool_retire(arch_svm->vmcb, arch_svm->core);
+        asidpool_retire(arch_svm->vmcb, arch_svm->asid_core);
          free_vmcb(arch_svm->vmcb);
     }
     if(arch_svm->iopm != NULL) {
@@ -404,18 +398,17 @@
 
 void svm_do_launch(struct vcpu *v)
 {
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    int core = smp_processor_id();
+    ASSERT(vmcb);
+
     /* Update CR3, GDT, LDT, TR */
-    struct vmcb_struct *vmcb;
-    int core = smp_processor_id();
-    vmcb = v->arch.hvm_svm.vmcb;
-    ASSERT(vmcb);
-
     svm_stts(v);
 
-    /* current core is the one we will perform the vmrun on */
-    v->arch.hvm_svm.core = core;
+    /* current core is the one we intend to perform the VMRUN on */
+    v->arch.hvm_svm.launch_core = v->arch.hvm_svm.asid_core = core;
     clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-    if ( !asidpool_assign_next(vmcb, 0, core, core) )
+    if ( !asidpool_assign_next( vmcb, 0, core, core ))
         BUG();
 
     if (v->vcpu_id == 0)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/x86_64/exits.S
--- a/xen/arch/x86/hvm/svm/x86_64/exits.S       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/x86_64/exits.S       Wed Mar  1 19:47:25 2006
@@ -107,8 +107,6 @@
         movq %rax, VMCB_rax(%rcx)
         movq VCPU_svm_hsa_pa(%rbx), %rax
         VMSAVE
-       /* XXX FPU SAVE */
-       /* XXX DO TSC OFFSET */
 
         movq VCPU_svm_vmcb_pa(%rbx), %rax
         popq %r15
@@ -137,9 +135,7 @@
         VMSAVE
         /* rax is the only register we're allowed to touch here... */
 
-       /* XXX FPU SAVE */
         GET_CURRENT(%rax)
-       /* XXX DO TSC OFFSET */
         movq VCPU_svm_hsa_pa(%rax), %rax
         VMLOAD
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vlapic.c Wed Mar  1 19:47:25 2006
@@ -225,27 +225,35 @@
         break;
 
     case VLAPIC_DELIV_MODE_INIT:
-        if (!level && trig_mode == 1) {        //Deassert
+        if ( !level && trig_mode == 1 ) {        //Deassert
             printk("This hvm_vlapic is for P4, no work for De-assert init\n");
         } else {
             /* FIXME How to check the situation after vcpu reset? */
-            vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
-            if (vlapic->vcpu) {
-                vcpu_pause(vlapic->vcpu);
+            if ( test_and_clear_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+                printk("Reset hvm vcpu not supported yet\n");
+                domain_crash_synchronous();
             }
+            v->arch.hvm_vcpu.init_sipi_sipi_state =
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
+            result = 1;
         }
         break;
 
     case VLAPIC_DELIV_MODE_STARTUP:
-        if (vlapic->init_sipi_sipi_state != VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI)
+        if ( v->arch.hvm_vcpu.init_sipi_sipi_state ==
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM )
             break;
-        vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_NORM;
-        if (!vlapic->vcpu) {
-            /* XXX Call hvm_bringup_ap here */
-             result = 0;
-        }else{
-            //hvm_vcpu_reset(vlapic->vcpu);
-        }
+
+        v->arch.hvm_vcpu.init_sipi_sipi_state =
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
+
+        if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+            printk("SIPI for initialized vcpu vcpuid %x\n", v->vcpu_id);
+            domain_crash_synchronous();
+        }
+
+        if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 )
+            result = 0;
         break;
 
     default:
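
The vlapic.c rework replaces the per-vlapic SIPI bookkeeping with a per-VCPU state machine that matches the real AP boot protocol: INIT puts the target into wait-for-SIPI, and the subsequent STARTUP IPI carries the trampoline vector and actually boots the processor through hvm_bringup_ap(). Condensed:

    /* INIT:    target enters wait-for-SIPI */
    v->arch.hvm_vcpu.init_sipi_sipi_state =
        HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;

    /* STARTUP: leave the wait state and boot the AP at the given vector */
    v->arch.hvm_vcpu.init_sipi_sipi_state =
        HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
    hvm_bringup_ap(v->vcpu_id, vector);
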
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/io.c
--- a/xen/arch/x86/hvm/vmx/io.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/io.c Wed Mar  1 19:47:25 2006
@@ -113,13 +113,15 @@
     struct hvm_virpit *vpit = &plat->vpit;
     struct hvm_virpic *pic= &plat->vpic;
 
-    hvm_pic_assist(v);
-    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
-    if ( vpit->pending_intr_nr ) {
+    if ( v->vcpu_id == 0 )
+        hvm_pic_assist(v);
+
+    if ( (v->vcpu_id == 0) && vpit->pending_intr_nr ) {
         pic_set_irq(pic, 0, 0);
         pic_set_irq(pic, 0, 1);
     }
 
+    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
     __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
 
     if (intr_fields & INTR_INFO_VALID_MASK) {
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Mar  1 19:47:25 2006
@@ -448,6 +448,37 @@
     return 0;                   /* dummy */
 }
 
+/* SMP VMX guest support */
+void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
+                         int vcpuid, int trampoline_vector)
+{
+    int i;
+
+    memset(ctxt, 0, sizeof(*ctxt));
+
+    /*
+     * Initial register values:
+     */
+    ctxt->user_regs.eip = VMXASSIST_BASE;
+    ctxt->user_regs.edx = vcpuid;
+    ctxt->user_regs.ebx = trampoline_vector;
+
+    ctxt->flags = VGCF_HVM_GUEST;
+
+    /* Virtual IDT is empty at start-of-day. */
+    for ( i = 0; i < 256; i++ )
+    {
+        ctxt->trap_ctxt[i].vector = i;
+        ctxt->trap_ctxt[i].cs     = FLAT_KERNEL_CS;
+    }
+
+    /* No callback handlers. */
+#if defined(__i386__)
+    ctxt->event_callback_cs     = FLAT_KERNEL_CS;
+    ctxt->failsafe_callback_cs  = FLAT_KERNEL_CS;
+#endif
+}
+
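
vmx_init_ap_context() is registered below as hvm_funcs.init_ap_context, so the generic hvm_bringup_ap() path can construct an AP's initial register state without any VMX knowledge. Presumably it is consumed along these lines (the surrounding calls are assumptions, not shown in this patch):

    struct vcpu_guest_context ctxt;
    hvm_funcs.init_ap_context(&ctxt, vcpuid, trampoline_vector);
    /* ...followed by something like arch_set_info_guest(v, &ctxt)
       to make the AP runnable at VMXASSIST_BASE. */
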
 void do_nmi(struct cpu_user_regs *);
 
 static int check_vmx_controls(ctrls, msr)
@@ -544,6 +575,8 @@
     hvm_funcs.paging_enabled = vmx_paging_enabled;
     hvm_funcs.instruction_length = vmx_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+    hvm_funcs.init_ap_context = vmx_init_ap_context;
 
     hvm_enabled = 1;
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/mm.c Wed Mar  1 19:47:25 2006
@@ -97,11 +97,11 @@
 #include <xen/domain_page.h>
 #include <xen/event.h>
 #include <xen/iocap.h>
+#include <xen/guest_access.h>
 #include <asm/shadow.h>
 #include <asm/page.h>
 #include <asm/flushtlb.h>
 #include <asm/io.h>
-#include <asm/uaccess.h>
 #include <asm/ldt.h>
 #include <asm/x86_emulate.h>
 #include <public/memory.h>
@@ -475,7 +475,8 @@
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
                 " for dom%d",
-                mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
+                mfn, get_gpfn_from_mfn(mfn),
+                l1e_get_intpte(l1e), d->domain_id);
     }
 
     return okay;
@@ -515,7 +516,6 @@
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-
 static int 
 get_page_from_l3e(
     l3_pgentry_t l3e, unsigned long pfn,
@@ -545,11 +545,9 @@
 #endif
     return rc;
 }
-
 #endif /* 3 level */
 
 #if CONFIG_PAGING_LEVELS >= 4
-
 static int 
 get_page_from_l4e(
     l4_pgentry_t l4e, unsigned long pfn, 
@@ -579,7 +577,6 @@
 
     return rc;
 }
-
 #endif /* 4 level */
 
 
@@ -649,27 +646,22 @@
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-
 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
          (l3e_get_pfn(l3e) != pfn) )
         put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
 }
-
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 4
-
 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
         put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
 }
-
 #endif
-
 
 static int alloc_l1_table(struct page_info *page)
 {
@@ -1569,43 +1561,71 @@
     int okay;
     unsigned long old_base_mfn;
 
+    ASSERT(writable_pagetable_in_sync(d));
+
     if ( shadow_mode_refcounts(d) )
+    {
         okay = get_page_from_pagenr(mfn, d);
+        if ( unlikely(!okay) )
+        {
+            MEM_LOG("Error while installing new baseptr %lx", mfn);
+            return 0;
+        }
+    }
     else
+    {
         okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
-
-    if ( likely(okay) )
-    {
-        invalidate_shadow_ldt(v);
-
-        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
-        update_pagetables(v); /* update shadow_table and monitor_table */
-
-        write_ptbase(v);
-
+        if ( unlikely(!okay) )
+        {
+            /* Switch to idle pagetable: this VCPU has no active p.t. now. */
+            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+            v->arch.guest_table = mk_pagetable(0);
+            update_pagetables(v);
+            write_cr3(__pa(idle_pg_table));
+            if ( old_base_mfn != 0 )
+                put_page_and_type(mfn_to_page(old_base_mfn));
+
+            /* Retry the validation with no active p.t. for this VCPU. */
+            okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+            if ( !okay )
+            {
+                /* Failure here is unrecoverable: the VCPU has no pagetable! */
+                MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
+                domain_crash(d);
+                percpu_info[v->processor].deferred_ops = 0;
+                return 0;
+            }
+        }
+    }
+
+    invalidate_shadow_ldt(v);
+
+    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+    v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+    update_pagetables(v); /* update shadow_table and monitor_table */
+
+    write_ptbase(v);
+
+    if ( likely(old_base_mfn != 0) )
+    {
         if ( shadow_mode_refcounts(d) )
             put_page(mfn_to_page(old_base_mfn));
         else
             put_page_and_type(mfn_to_page(old_base_mfn));
-
-        /* CR3 also holds a ref to its shadow... */
-        if ( shadow_mode_enabled(d) )
-        {
-            if ( v->arch.monitor_shadow_ref )
-                put_shadow_ref(v->arch.monitor_shadow_ref);
-            v->arch.monitor_shadow_ref =
-                pagetable_get_pfn(v->arch.monitor_table);
-            ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
-            get_shadow_ref(v->arch.monitor_shadow_ref);
-        }
-    }
-    else
-    {
-        MEM_LOG("Error while installing new baseptr %lx", mfn);
-    }
-
-    return okay;
+    }
+
+    /* CR3 also holds a ref to its shadow... */
+    if ( shadow_mode_enabled(d) )
+    {
+        if ( v->arch.monitor_shadow_ref )
+            put_shadow_ref(v->arch.monitor_shadow_ref);
+        v->arch.monitor_shadow_ref =
+            pagetable_get_pfn(v->arch.monitor_table);
+        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
+        get_shadow_ref(v->arch.monitor_shadow_ref);
+    }
+
+    return 1;
 }
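
new_guest_cr3() previously gave up if the new base frame failed validation, even though the failure can be transient: the VCPU's current top-level table may still pin the frame with type references. The rewrite parks the VCPU on Xen's idle pagetable, releases the old base, retries validation once, and only crashes the domain if the retry also fails. In outline (the helper name is hypothetical):

    okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
    if ( !okay )
    {
        park_on_idle_pagetable(v);   /* hypothetical: mk_pagetable(0),
                                        write_cr3(__pa(idle_pg_table)),
                                        drop the old base's type ref   */
        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
        if ( !okay )
            domain_crash(d);         /* no pagetable left: unrecoverable */
    }
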
 
 static void process_deferred_ops(unsigned int cpu)
@@ -1625,7 +1645,7 @@
         else
             local_flush_tlb();
     }
-        
+
     if ( deferred_ops & DOP_RELOAD_LDT )
         (void)map_ldt_shadow_page(0);
 
@@ -1752,9 +1772,9 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall4_create_continuation(
-                __HYPERVISOR_mmuext_op, uops,
-                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_mmuext_op, "pipi",
+                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
             break;
         }
 
@@ -2018,9 +2038,9 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall4_create_continuation(
-                __HYPERVISOR_mmu_update, ureqs, 
-                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_mmu_update, "pipi",
+                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
             break;
         }
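
The numbered hypercallN_create_continuation() helpers give way to a single variadic hypercall_create_continuation() that takes a printf-style format string, one character per argument; judging from these call sites, 'p' marks a guest pointer/handle and 'i' a plain integer, so "pipi" describes (uops, count|flags, pdone, foreigndom). Usage, as in the hunk above:

    rc = hypercall_create_continuation(
        __HYPERVISOR_mmuext_op, "pipi",   /* ptr, int, ptr, int */
        uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
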
 
@@ -2769,7 +2789,7 @@
 }
 
 
-long arch_memory_op(int op, void *arg)
+long arch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_reserved_phys_area xrpa;
     unsigned long pfn;
@@ -2779,7 +2799,7 @@
     switch ( op )
     {
     case XENMEM_reserved_phys_area:
-        if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
+        if ( copy_from_guest(&xrpa, arg, 1) )
             return -EFAULT;
 
         /* No guest has more than one reserved area. */
@@ -2813,7 +2833,7 @@
 
         put_domain(d);
 
-        if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) )
+        if ( copy_to_guest(arg, &xrpa, 1) )
             return -EFAULT;
 
         break;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/setup.c      Wed Mar  1 19:47:25 2006
@@ -144,6 +144,20 @@
 
 static struct e820entry e820_raw[E820MAX];
 
+static unsigned long initial_images_start, initial_images_end;
+
+unsigned long initial_images_nrpages(void)
+{
+    unsigned long s = initial_images_start + PAGE_SIZE - 1;
+    unsigned long e = initial_images_end;
+    return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT));
+}
+
+void discard_initial_images(void)
+{
+    init_domheap_pages(initial_images_start, initial_images_end);
+}
+
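
initial_images_nrpages() rounds the start address up and truncates the end, so only pages wholly contained in [start, end) are counted. A quick worked example with 4KiB pages:

    /* start = 0x1234, end = 0x5000, PAGE_SHIFT = 12:
     *   s = 0x1234 + 0xFFF = 0x2233  ->  s >> 12 = 2
     *   e = 0x5000                   ->  e >> 12 = 5
     * nrpages = 5 - 2 = 3  (pages 2, 3 and 4; the partial
     * page starting at 0x1234 is excluded). */
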
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char *cmdline;
@@ -152,7 +166,6 @@
     unsigned int initrdidx = 1;
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_length;
-    unsigned long initial_images_start, initial_images_end;
     paddr_t s, e;
     int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0;
     struct ns16550_defaults ns16550 = {
@@ -437,11 +450,7 @@
         set_in_cr4(X86_CR4_OSXMMEXCPT);
 
     if ( opt_nosmp )
-    {
         max_cpus = 0;
-        smp_num_siblings = 1;
-        boot_cpu_data.x86_max_cores = 1;
-    }
 
     smp_prepare_cpus(max_cpus);
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow32.c
--- a/xen/arch/x86/shadow32.c   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/shadow32.c   Wed Mar  1 19:47:25 2006
@@ -43,7 +43,8 @@
 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
 #endif
 
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
 
 /********
 
@@ -739,7 +740,7 @@
     mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
     memset(mpl2e, 0, PAGE_SIZE);
 
-    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
            &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
            HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
 
@@ -760,6 +761,23 @@
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l2_pgentry_t *l2tab;
+
+            l2tab = map_domain_page(mfn);
+
+            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l2tab);
+        }
+    }
 }
 
 /*
@@ -771,7 +789,7 @@
     unsigned long mfn;
 
     ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
-    
+
     mpl2e = v->arch.monitor_vtable;
 
     /*
@@ -794,7 +812,7 @@
     }
 
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     /*
      * Then free monitor_table.
@@ -808,8 +826,8 @@
 }
 
 static int
-map_p2m_entry(
-    l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(l1_pgentry_t *l1tab, unsigned long va,
+              unsigned long gpa, unsigned long mfn)
 {
     unsigned long *l0tab = NULL;
     l1_pgentry_t l1e = { 0 };
@@ -820,27 +838,22 @@
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
-            goto fail;
-
-        if ( l0tab  )
-            unmap_domain_page(l0tab);
+            return 0;
+
         l0tab = map_domain_page(page_to_mfn(page));
-        memset(l0tab, 0, PAGE_SIZE );
+        memset(l0tab, 0, PAGE_SIZE);
+
         l1e = l1tab[l1_table_offset(va)] =
             l1e_from_page(page, __PAGE_HYPERVISOR);
     }
-    else if ( l0tab == NULL)
+    else
         l0tab = map_domain_page(l1e_get_pfn(l1e));
 
-    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
-    if ( l0tab )
-        unmap_domain_page(l0tab);
+    l0tab[gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
+
+    unmap_domain_page(l0tab);
 
     return 1;
-
-fail:
-    return 0;
 }
 
 int
@@ -853,7 +866,6 @@
     l1_pgentry_t *l1;
     struct page_info *l1page;
     unsigned long va = pfn << PAGE_SHIFT;
-    int error;
 
     if ( shadow_mode_external(d) )
     {
@@ -877,6 +889,7 @@
 
     if ( shadow_mode_external(d) )
     {
+        int error;
         l1_pgentry_t *l1tab = NULL;
         l2_pgentry_t l2e;
 
@@ -885,14 +898,13 @@
         ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
 
         l1tab = map_domain_page(l2e_get_pfn(l2e));
-        error = map_p2m_entry(l1tab, va, pfn, mfn);
-        if ( !error )
-            domain_crash_synchronous(); 
+        if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) )
+            domain_crash(d);
 
         unmap_domain_page(l1tab);
         unmap_domain_page_with_cache(l2, l2cache);
 
-        return 1;
+        return error;
     }
 
     /*
@@ -926,7 +938,7 @@
     return 1;
 }
 
-int
+static int
 alloc_p2m_table(struct domain *d)
 {
     struct list_head *list_ent;
@@ -937,7 +949,7 @@
     l2_pgentry_t l2e = { 0 };
     struct page_info *page;
     unsigned long gpfn, mfn;
-    int error;
+    int error = 0;
 
     if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
     {
@@ -955,6 +967,9 @@
         }
         else
             l1tab = map_domain_page(l2e_get_pfn(l2e));
+
+        if ( l2tab )
+            unmap_domain_page(l2tab);
     }
     else
     {
@@ -972,23 +987,23 @@
         page = list_entry(list_ent, struct page_info, list);
         mfn = page_to_mfn(page);
 
-        error = map_p2m_entry(l1tab, va, gpfn, mfn);
-        if ( !error )
-            domain_crash_synchronous(); 
+        if ( !(error = map_p2m_entry(l1tab, va, gpfn, mfn)) )
+        {
+            domain_crash(d);
+            break;
+        }
 
         list_ent = frame_table[mfn].list.next;
         va += sizeof(mfn);
     }
 
-    if (l2tab)
-        unmap_domain_page(l2tab);
     unmap_domain_page(l1tab);
 
-    return 1;
-}
-
-static void 
-free_p2m_table(struct vcpu *v)
+    return error;
+}
+
+static void
+free_p2m_table(struct domain *d)
 {
     unsigned long va;
     l2_pgentry_t *l2tab;
@@ -996,10 +1011,10 @@
     l2_pgentry_t l2e;
     l1_pgentry_t l1e;
 
-    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
     l2tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 
     for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
     {
@@ -1015,11 +1030,13 @@
 
                 if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
                     free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
-                va += PAGE_SIZE; 
+                va += PAGE_SIZE;
             }
             unmap_domain_page(l1tab);
             free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
         }
+        else
+            va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
     }
     unmap_domain_page(l2tab);
 }
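
Both shadow32.c and shadow_public.c maintain the p2m table as a flat array of machine frame numbers mapped read-only at RO_MPT_VIRT_START in the monitor pagetable, which is why every entry above is installed at va = RO_MPT_VIRT_START + gpfn * sizeof(unsigned long). Given that layout, a lookup is a plain array index:

    /* Sketch: translating a guest pfn via the mapped p2m array. */
    unsigned long *p2m = (unsigned long *)RO_MPT_VIRT_START;
    unsigned long mfn  = p2m[gpfn];   /* gpfn: guest page frame number */
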
@@ -1246,7 +1263,7 @@
 
     if ( shadow_mode_refcounts(d) )
     {
-        struct list_head *list_ent; 
+        struct list_head *list_ent;
         struct page_info *page;
 
         /*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow_public.c
--- a/xen/arch/x86/shadow_public.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/shadow_public.c      Wed Mar  1 19:47:25 2006
@@ -31,7 +31,8 @@
 #include <xen/trace.h>
 #include <asm/shadow_64.h>
 
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
 
 #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
 
@@ -328,6 +329,23 @@
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l4_pgentry_t *l4tab;
+
+            l4tab = map_domain_page(mfn);
+
+            mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
+                l4tab[l4_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l4tab);
+        }
+    }
 }
 
 void free_monitor_pagetable(struct vcpu *v)
@@ -338,7 +356,7 @@
      * free monitor_table.
      */
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     /*
      * Then free monitor_table.
@@ -397,13 +415,49 @@
             l2e_empty();
     mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
 
-    unmap_domain_page(mpl2e);
-
     v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
     v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l3_pgentry_t *l3tab, l3e;
+            l2_pgentry_t *l2tab;
+
+            l3tab = map_domain_page(mfn);
+            l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+            /*
+             * NB: when CONFIG_PAGING_LEVELS == 3,
+             * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+             * alloc_monitor_pagetable should guarantee this.
+             */
+            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+                BUG();
+
+            l2tab = map_domain_page(l3e_get_pfn(l3e));
+
+            /*
+             * Just one l2 slot is used here, so at most 2M for p2m table:
+             *      ((4K * 512)/sizeof(unsigned long)) * 4K = 2G
+             * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX
+             * guest memory.
+             */
+            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l2tab);
+            unmap_domain_page(l3tab);
+        }
+    }
+
+    unmap_domain_page(mpl2e);
 }
 
 void free_monitor_pagetable(struct vcpu *v)
@@ -413,7 +467,7 @@
      * free monitor_table.
      */
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     m3mfn = pagetable_get_pfn(v->arch.monitor_table);
     m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
@@ -1348,14 +1402,14 @@
 }
 
 static int
-map_p2m_entry(
-    pgentry_64_t *top_tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
+              unsigned long gpfn, unsigned long mfn)
 {
 #if CONFIG_PAGING_LEVELS >= 4
     pgentry_64_t l4e = { 0 };
+    pgentry_64_t *l3tab = NULL;
 #endif
 #if CONFIG_PAGING_LEVELS >= 3
-    pgentry_64_t *l3tab = NULL;
     pgentry_64_t l3e = { 0 };
 #endif
     l2_pgentry_t *l2tab = NULL;
@@ -1367,7 +1421,7 @@
 
 #if CONFIG_PAGING_LEVELS >= 4
     l4e = top_tab[l4_table_offset(va)];
-    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) 
+    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1375,17 +1429,14 @@
 
         l3tab = map_domain_page(page_to_mfn(page));
         memset(l3tab, 0, PAGE_SIZE);
-        l4e = top_tab[l4_table_offset(va)] = 
+        l4e = top_tab[l4_table_offset(va)] =
             entry_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l3tab == NULL)
+    }
+    else
         l3tab = map_domain_page(entry_get_pfn(l4e));
 
     l3e = l3tab[l3_table_offset(va)];
-#else
-    l3e = top_tab[l3_table_offset(va)];
-#endif
-    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) 
+    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1393,14 +1444,29 @@
 
         l2tab = map_domain_page(page_to_mfn(page));
         memset(l2tab, 0, PAGE_SIZE);
-        l3e = l3tab[l3_table_offset(va)] = 
+        l3e = l3tab[l3_table_offset(va)] =
             entry_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l2tab == NULL) 
+    }
+    else
         l2tab = map_domain_page(entry_get_pfn(l3e));
 
+    unmap_domain_page(l3tab);
+#else
+    l3e = top_tab[l3_table_offset(va)];
+
+    /*
+     * NB: when CONFIG_PAGING_LEVELS == 3,
+     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+     * alloc_monitor_pagetable should guarantee this.
+     */
+    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
+        BUG();
+
+    l2tab = map_domain_page(entry_get_pfn(l3e));
+#endif
+
     l2e = l2tab[l2_table_offset(va)];
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) 
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1408,14 +1474,16 @@
 
         l1tab = map_domain_page(page_to_mfn(page));
         memset(l1tab, 0, PAGE_SIZE);
-        l2e = l2tab[l2_table_offset(va)] = 
+        l2e = l2tab[l2_table_offset(va)] =
             l2e_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l1tab == NULL) 
+    }
+    else
         l1tab = map_domain_page(l2e_get_pfn(l2e));
 
+    unmap_domain_page(l2tab);
+
     l1e = l1tab[l1_table_offset(va)];
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) 
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1423,96 +1491,88 @@
 
         l0tab = map_domain_page(page_to_mfn(page));
         memset(l0tab, 0, PAGE_SIZE);
-        l1e = l1tab[l1_table_offset(va)] = 
+        l1e = l1tab[l1_table_offset(va)] =
             l1e_from_page(page, __PAGE_HYPERVISOR);
     }
-    else if ( l0tab == NULL) 
+    else
         l0tab = map_domain_page(l1e_get_pfn(l1e));
 
-    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
-    if ( l2tab )
-    {
-        unmap_domain_page(l2tab);
-        l2tab = NULL;
-    }
-    if ( l1tab )
-    {
-        unmap_domain_page(l1tab);
-        l1tab = NULL;
-    }
-    if ( l0tab )
-    {
-        unmap_domain_page(l0tab);
-        l0tab = NULL;
-    }
+    unmap_domain_page(l1tab);
+
+    l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
+
+    unmap_domain_page(l0tab);
 
     return 1;
 
 nomem:
-
     return 0;
 }
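
map_p2m_entry() repeats one idiom at every paging level: look up the entry, allocate and zero a fresh page if it is not present, otherwise descend into the existing one, and unmap the current level as soon as the next is mapped (the old code deferred all unmaps to the end, which is what the removed NULL-check clutter was for). The step, sketched for the L2->L1 transition:

    static l1_pgentry_t *descend_l2_sketch(l2_pgentry_t *l2tab,
                                           unsigned long va)
    {
        l1_pgentry_t *l1tab;
        l2_pgentry_t l2e = l2tab[l2_table_offset(va)];

        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
        {
            struct page_info *page = alloc_domheap_page(NULL);
            if ( !page )
                return NULL;                    /* nomem */
            l1tab = map_domain_page(page_to_mfn(page));
            memset(l1tab, 0, PAGE_SIZE);
            l2tab[l2_table_offset(va)] =
                l2e_from_page(page, __PAGE_HYPERVISOR);
        }
        else
            l1tab = map_domain_page(l2e_get_pfn(l2e));

        unmap_domain_page(l2tab);               /* this level is finished */
        return l1tab;
    }
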
 
 int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
               struct domain_mmap_cache *l2cache,
               struct domain_mmap_cache *l1cache)
 {
-    unsigned long tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-    pgentry_64_t *top;
-    unsigned long va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long));
+    unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+    unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
+    pgentry_64_t *top_tab;
     int error;
 
-    ASSERT(tabpfn != 0);
+    ASSERT(tabmfn != 0);
     ASSERT(shadow_lock_is_acquired(d));
 
-    top = map_domain_page_with_cache(tabpfn, l2cache);
-    error = map_p2m_entry(top, va, pfn, mfn);
-    unmap_domain_page_with_cache(top, l2cache);
-
-    if ( !error )
-         domain_crash_synchronous();
-        
-    return 1;
-}
-
-int
+    top_tab = map_domain_page_with_cache(tabmfn, l2cache);
+
+    if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+        domain_crash(d);
+
+    unmap_domain_page_with_cache(top_tab, l2cache);
+
+    return error;
+}
+
+static int
 alloc_p2m_table(struct domain *d)
 {
     struct list_head *list_ent;
     unsigned long va = RO_MPT_VIRT_START; /*  phys_to_machine_mapping */
     pgentry_64_t *top_tab = NULL;
     unsigned long mfn;
-    int gpa;
-
-    ASSERT ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
+    int gpfn, error = 0;
+
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
     top_tab = map_domain_page(
         pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 
-
     list_ent = d->page_list.next;
 
-    for ( gpa = 0; list_ent != &d->page_list; gpa++ ) 
+    for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
     {
         struct page_info *page;
+
         page = list_entry(list_ent, struct page_info, list);
         mfn = page_to_mfn(page);
 
-        map_p2m_entry(top_tab, va, gpa, mfn);
+        if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+        {
+            domain_crash(d);
+            break;
+        }
+
         list_ent = frame_table[mfn].list.next;
         va += sizeof(mfn);
     }
 
     unmap_domain_page(top_tab);
 
-    return 1;
+    return error;
 }
 
 #if CONFIG_PAGING_LEVELS >= 3
 static void
-free_p2m_table(struct vcpu *v)
+free_p2m_table(struct domain *d)
 {
     unsigned long va;
     l1_pgentry_t *l1tab;
@@ -1520,27 +1580,35 @@
     l2_pgentry_t *l2tab;
     l2_pgentry_t l2e;
 #if CONFIG_PAGING_LEVELS >= 3
-    l3_pgentry_t *l3tab; 
+    l3_pgentry_t *l3tab;
     l3_pgentry_t l3e;
 #endif
 #if CONFIG_PAGING_LEVELS == 4
     int i3;
-    l4_pgentry_t *l4tab; 
+    l4_pgentry_t *l4tab;
     l4_pgentry_t l4e;
 #endif
 
-    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
 #if CONFIG_PAGING_LEVELS == 4
     l4tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 #endif
 #if CONFIG_PAGING_LEVELS == 3
     l3tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
-
-    va = RO_MPT_VIRT_START;
-    l3e = l3tab[l3_table_offset(va)];
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
+
+    l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+    /*
+     * NB: when CONFIG_PAGING_LEVELS == 3,
+     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+     * alloc_monitor_pagetable should guarantee this.
+     */
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        BUG();
+
     l2tab = map_domain_page(l3e_get_pfn(l3e));
 #endif
 
@@ -1555,8 +1623,8 @@
 
             for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
             {
-
                 l3e = l3tab[l3_table_offset(va)];
+
                 if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
                 {
                     int i2;
@@ -1567,12 +1635,13 @@
                     {
 #endif
                         l2e = l2tab[l2_table_offset(va)];
+
                         if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
                         {
                             int i1;
 
                             l1tab = map_domain_page(l2e_get_pfn(l2e));
-                            
+
                             /*
                              * unsigned long phys_to_machine_mapping[]
                              */
@@ -1591,7 +1660,7 @@
                         else
                             va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
 
-#if CONFIG_PAGING_LEVELS == 4                    
+#if CONFIG_PAGING_LEVELS == 4
                     }
                     unmap_domain_page(l2tab);
                     free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
@@ -1603,7 +1672,7 @@
             free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
         }
         else
-            va += PAGE_SIZE * 
+            va += PAGE_SIZE *
                L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
 #endif
     }
@@ -1622,7 +1691,7 @@
     paddr_t pa, l1_pgentry_t gpte,
     struct domain_mmap_cache *cache)
 {
-    unsigned long sl1mfn;    
+    unsigned long sl1mfn;
     l1_pgentry_t *spl1e, spte;
 
     shadow_lock(d);
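
For reference, the error-handling change in set_p2m_entry above replaces a
synchronous crash with a deferred one, so the mapped page can still be
released on the failure path. A minimal stand-alone sketch of that
unmap-before-crash idiom; every name below is a hypothetical stand-in, not a
Xen API:

    #include <stdio.h>

    static int dummy_resource;

    static void *acquire_mapping(void)    { return &dummy_resource; }
    static void  release_mapping(void *m) { (void)m; }
    static int   try_map_entry(void)      { return 0; /* 0 == failure, as in the hunk */ }
    static void  mark_crashed(void)       { printf("domain marked for crash\n"); }

    static int set_entry(void)
    {
        void *map = acquire_mapping();
        int ok;

        if ( !(ok = try_map_entry()) )   /* failure: flag the crash... */
            mark_crashed();

        release_mapping(map);            /* ...but always unmap first */
        return ok;                       /* caller sees 0 on failure */
    }

    int main(void)
    {
        return set_entry() ? 0 : 1;
    }
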
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/traps.c      Wed Mar  1 19:47:25 2006
@@ -951,6 +951,7 @@
             
         case 3: /* Write CR3 */
             LOCK_BIGLOCK(v->domain);
+            cleanup_writable_pagetable(v->domain);
             (void)new_guest_cr3(gmfn_to_mfn(v->domain, paddr_to_pfn(*reg)));
             UNLOCK_BIGLOCK(v->domain);
             break;
@@ -1002,7 +1003,6 @@
 #endif
         default:
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
-                 (regs->ecx != MSR_EFER) ||
                  (regs->eax != l) || (regs->edx != h) )
                 DPRINTK("Domain attempted WRMSR %p from "
                         "%08x:%08x to %08lx:%08lx.\n",
@@ -1033,8 +1033,8 @@
                 goto fail;
             break;
         default:
-            DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));
             /* Everyone can read the MSR space. */
+            /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
             break;
@@ -1416,8 +1416,8 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall1_create_continuation(
-                __HYPERVISOR_set_trap_table, traps);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_set_trap_table, "p", traps);
             break;
         }
 
@@ -1430,7 +1430,7 @@
         if ( cur.address == 0 )
             break;
 
-        fixup_guest_selector(cur.cs);
+        fixup_guest_code_selector(cur.cs);
 
         memcpy(&dst[cur.vector], &cur, sizeof(cur));
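
The continuation call above moves from the arity-suffixed helpers
(hypercall1_create_continuation and friends) to a single variadic interface
driven by a printf-style format string. A sketch of the argument-walking
pattern; the format letters and the printing are illustrative only (the real
letters and their meanings are defined by Xen):

    #include <stdarg.h>
    #include <stdio.h>

    static void pack_continuation(unsigned int op, const char *fmt, ...)
    {
        va_list args;
        va_start(args, fmt);
        for ( const char *p = fmt; *p != '\0'; p++ )
        {
            switch ( *p )
            {
            case 'i': printf("op %u: int  %d\n",  op, va_arg(args, int));    break;
            case 'l': printf("op %u: long %ld\n", op, va_arg(args, long));   break;
            case 'p': /* pointers and handles are both pointer-sized here */
            case 'h': printf("op %u: ptr  %p\n",  op, va_arg(args, void *)); break;
            }
        }
        va_end(args);
    }

    int main(void)
    {
        pack_continuation(12 /* illustrative op number */, "li", 128L, 4);
        return 0;
    }
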
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/asm-offsets.c Wed Mar  1 19:47:25 2006
@@ -72,6 +72,13 @@
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     BLANK();
 
+    OFFSET(TSS_ss0, struct tss_struct, ss0);
+    OFFSET(TSS_esp0, struct tss_struct, esp0);
+    OFFSET(TSS_ss1, struct tss_struct, ss1);
+    OFFSET(TSS_esp1, struct tss_struct, esp1);
+    DEFINE(TSS_sizeof, sizeof(struct tss_struct));
+    BLANK();
+
     OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
     OFFSET(VCPU_svm_hsa_pa,  struct vcpu, arch.hvm_svm.host_save_pa);
     OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
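
The TSS_* symbols added above come from the asm-offsets mechanism: C code
computes structure offsets so entry.S can address the fields symbolically. A
simplified stand-alone sketch; the demo struct layout is illustrative, and
the real build extracts the constants from the compiler's asm output rather
than printing them:

    #include <stddef.h>
    #include <stdio.h>

    struct tss_demo { unsigned short ss0; unsigned long esp0; };

    /* Emit one "#define" per structure member offset. */
    #define OFFSET(sym, str, mem) \
        printf("#define %-10s %zu\n", #sym, offsetof(str, mem))

    int main(void)
    {
        OFFSET(TSS_ss0,  struct tss_demo, ss0);
        OFFSET(TSS_esp0, struct tss_demo, esp0);
        printf("#define %-10s %zu\n", "TSS_sizeof", sizeof(struct tss_demo));
        return 0;
    }
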
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/entry.S       Wed Mar  1 19:47:25 2006
@@ -77,6 +77,13 @@
 restore_all_guest:
         testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
         jnz  restore_all_vm86
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testl $2,UREGS_cs(%esp)
+        jnz   1f
+        call  restore_ring0_guest
+        jmp   restore_all_vm86
+1:
+#endif
 FLT1:   mov  UREGS_ds(%esp),%ds
 FLT2:   mov  UREGS_es(%esp),%es
 FLT3:   mov  UREGS_fs(%esp),%fs
@@ -157,6 +164,7 @@
         ALIGN
 ENTRY(hypercall)
         subl $4,%esp
+        FIXUP_RING0_GUEST_STACK
        SAVE_ALL(b)
         sti
         GET_CURRENT(%ebx)
@@ -294,6 +302,11 @@
         popl %eax
         shll $16,%eax                    # Bits 16-23: saved_upcall_mask
         movw UREGS_cs+4(%esp),%ax        # Bits  0-15: CS
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testw $2,%ax
+        jnz  FLT15
+        and  $~3,%ax                     # RPL 1 -> RPL 0
+#endif
 FLT15:  movl %eax,%gs:4(%esi) 
         test $0x00FF0000,%eax            # Bits 16-23: saved_upcall_mask
         setz %ch                         # %ch == !saved_upcall_mask
@@ -388,6 +401,7 @@
        pushl $TRAP_divide_error<<16
        ALIGN
 error_code:
+        FIXUP_RING0_GUEST_STACK
         SAVE_ALL_NOSEGREGS(a)
         SET_XEN_SEGMENTS(a)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
@@ -505,6 +519,10 @@
        jmp error_code
 
 ENTRY(nmi)
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        # NMI entry protocol is incompatible with guest kernel in ring 0.
+        iret
+#else
         # Save state but do not trash the segment registers!
         # We may otherwise be unable to reload them or copy them to ring 1. 
        pushl %eax
@@ -546,6 +564,7 @@
         movl  $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
                 TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
         jmp   restore_all_xen
+#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
 
 ENTRY(setup_vm86_frame)
         # Copies the entire stack frame forwards by 16 bytes.
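
Several of the new CONFIG_X86_SUPERVISOR_MODE_KERNEL paths above hinge on
testing bit 1 of a selector: RPLs 0 and 1 have it clear, RPLs 2 and 3 have
it set, which is what "testl $2,UREGS_cs(%esp)" checks. A small C
illustration of that bit trick (selector values are made up):

    #include <stdio.h>

    static int is_ring_2_or_3(unsigned int sel) { return (sel & 2) != 0; }

    int main(void)
    {
        unsigned int sels[] = { 0x08 | 0, 0x08 | 1, 0x08 | 3 };
        for ( int i = 0; i < 3; i++ )
            printf("selector 0x%02x: RPL %u, ring 2/3? %s\n",
                   sels[i], sels[i] & 3,
                   is_ring_2_or_3(sels[i]) ? "yes" : "no");
        return 0;
    }
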
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c  Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/mm.c  Wed Mar  1 19:47:25 2006
@@ -23,6 +23,7 @@
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/page.h>
 #include <asm/flushtlb.h>
@@ -180,9 +181,18 @@
             page_set_owner(page, dom_xen);
         }
     }
-}
-
-long subarch_memory_op(int op, void *arg)
+
+    if ( supervisor_mode_kernel )
+    {
+        /* Guest kernel runs in ring 0, not ring 1. */
+        struct desc_struct *d;
+        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d[0].b &= ~_SEGMENT_DPL;
+        d[1].b &= ~_SEGMENT_DPL;
+    }
+}
+
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_machphys_mfn_list xmml;
     unsigned long mfn;
@@ -192,7 +202,7 @@
     switch ( op )
     {
     case XENMEM_machphys_mfn_list:
-        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+        if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
         max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);
@@ -201,11 +211,12 @@
         {
             mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
                 RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
-            if ( put_user(mfn, &xmml.extent_start[i]) )
+            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
         }
 
-        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+        xmml.nr_extents = i;
+        if ( copy_to_guest(arg, &xmml, 1) )
             return -EFAULT;
 
         break;
@@ -223,7 +234,7 @@
     int nr = smp_processor_id();
     struct tss_struct *t = &init_tss[nr];
 
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
 
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
@@ -239,6 +250,10 @@
     unsigned long base, limit;
     u32 a = d->a, b = d->b;
     u16 cs;
+
+    /* Let a ring0 guest kernel set any descriptor it wants to. */
+    if ( supervisor_mode_kernel )
+        return 1;
 
     /* A not-present descriptor will always fault, so is safe. */
     if ( !(b & _SEGMENT_P) ) 
@@ -273,7 +288,7 @@
 
         /* Validate and fix up the target code selector. */
         cs = a >> 16;
-        fixup_guest_selector(cs);
+        fixup_guest_code_selector(cs);
         if ( !guest_gate_selector_okay(cs) )
             goto bad;
         a = d->a = (d->a & 0xffffU) | (cs << 16);
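
The supervisor_mode_kernel branch added to the subarch init above drops the
DPL of the FLAT_RING1 descriptors to ring 0 by masking the DPL bits out of
the descriptor's high word. A stand-alone sketch of that bit manipulation;
the descriptor value and the _DEMO mask name are illustrative, though the
DPL genuinely occupies bits 13-14 of the high word on x86:

    #include <stdint.h>
    #include <stdio.h>

    #define _SEGMENT_DPL_DEMO (3u << 13)   /* DPL field, bits 13-14 of word 'b' */

    int main(void)
    {
        uint32_t b = 0x00cf9a00 | (1u << 13);   /* code segment, DPL 1 */
        printf("DPL before: %u\n", (b >> 13) & 3);
        b &= ~_SEGMENT_DPL_DEMO;                /* ring 1 -> ring 0 */
        printf("DPL after:  %u\n", (b >> 13) & 3);
        return 0;
    }
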
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/traps.c       Wed Mar  1 19:47:25 2006
@@ -256,8 +256,14 @@
      * We can't virtualise interrupt gates, as there's no way to get
      * the CPU to automatically clear the events_mask variable. Also we
      * must ensure that the CS is safe to poke into an interrupt gate.
-     */
-    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
+     *
+     * When running with supervisor_mode_kernel enabled, a direct trap
+     * to the guest OS cannot be used because the INT instruction will
+     * switch to the Xen stack, and we need to swap back to the guest
+     * kernel stack before passing control to the system call entry point.
+     */
+    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
+         supervisor_mode_kernel )
     {
         v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
         return;
@@ -278,8 +284,8 @@
 {
     struct vcpu *d = current;
 
-    fixup_guest_selector(event_selector);
-    fixup_guest_selector(failsafe_selector);
+    fixup_guest_code_selector(event_selector);
+    fixup_guest_code_selector(failsafe_selector);
 
     d->arch.guest_context.event_callback_cs     = event_selector;
     d->arch.guest_context.event_callback_eip    = event_address;
@@ -289,12 +295,51 @@
     return 0;
 }
 
-void hypercall_page_initialise(void *hypercall_page)
-{
+static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
+{
+    extern asmlinkage int hypercall(void);
     char *p;
     int i;
 
     /* Fill in all the transfer points with template machine code. */
+
+    for ( i = 0; i < NR_hypercalls; i++ )
+    {
+        p = (char *)(hypercall_page + (i * 32));
+
+        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
+        *(u8  *)(p+ 1) = 0xfa;      /* cli */
+        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
+        *(u32 *)(p+ 3) = i;
+        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+        *(u32 *)(p+ 8) = (u32)&hypercall;
+        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
+        *(u8  *)(p+14) = 0xc3;      /* ret */
+    }
+
+    /*
+     * HYPERVISOR_iret is special because it doesn't return and expects a
+     * special stack frame. Guests jump at this transfer point instead of
+     * calling it.
+     */
+    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
+    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
+    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
+    *(u8  *)(p+ 2) = 0xfa;      /* cli */
+    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
+    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
+    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+    *(u32 *)(p+ 9) = (u32)&hypercall;
+    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
+}
+
+static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
+{
+    char *p;
+    int i;
+
+    /* Fill in all the transfer points with template machine code. */
+
     for ( i = 0; i < (PAGE_SIZE / 32); i++ )
     {
         p = (char *)(hypercall_page + (i * 32));
@@ -314,6 +359,14 @@
     *(u8  *)(p+ 1) = 0xb8;    /* mov  $__HYPERVISOR_iret,%eax */
     *(u32 *)(p+ 2) = __HYPERVISOR_iret;
     *(u16 *)(p+ 6) = 0x82cd;  /* int  $0x82 */
+}
+
+void hypercall_page_initialise(void *hypercall_page)
+{
+    if ( supervisor_mode_kernel )
+        hypercall_page_initialise_ring0_kernel(hypercall_page);
+    else
+        hypercall_page_initialise_ring1_kernel(hypercall_page);
 }
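
The two initialisers above emit raw machine-code templates into the
hypercall page: an lcall-based stub for ring-0 guests and an int $0x82 stub
for ring-1 guests. A sketch of the ring-1 style template generation into an
ordinary buffer, using the byte values from the hunk; nothing here executes
the stubs:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void fill_stub(uint8_t *p, uint32_t nr)
    {
        p[0] = 0xb8;                       /* mov $<nr>,%eax */
        memcpy(p + 1, &nr, sizeof(nr));    /* 32-bit immediate */
        p[5] = 0xcd;                       /* int $0x82 */
        p[6] = 0x82;
        p[7] = 0xc3;                       /* ret */
    }

    int main(void)
    {
        static uint8_t page[4096];
        for ( uint32_t i = 0; i < sizeof(page) / 32; i++ )
            fill_stub(page + i * 32, i);   /* one 32-byte stub per hypercall */
        printf("stub 1 opcode bytes: %02x %02x %02x\n",
               page[32], page[37], page[39]);
        return 0;
    }
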
 
 /*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c  Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_64/mm.c  Wed Mar  1 19:47:25 2006
@@ -22,6 +22,7 @@
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/asm_defns.h>
 #include <asm/page.h>
@@ -182,7 +183,7 @@
     }
 }
 
-long subarch_memory_op(int op, void *arg)
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_machphys_mfn_list xmml;
     l3_pgentry_t l3e;
@@ -194,7 +195,7 @@
     switch ( op )
     {
     case XENMEM_machphys_mfn_list:
-        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+        if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
         for ( i = 0, v = RDWR_MPT_VIRT_START;
@@ -209,11 +210,12 @@
             if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                 break;
             mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
-            if ( put_user(mfn, &xmml.extent_start[i]) )
+            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
         }
 
-        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+        xmml.nr_extents = i;
+        if ( copy_to_guest(arg, &xmml, 1) )
             return -EFAULT;
 
         break;
@@ -228,7 +230,7 @@
 
 long do_stack_switch(unsigned long ss, unsigned long esp)
 {
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
     return 0;
@@ -315,7 +317,7 @@
 
     /* Validate and fix up the target code selector. */
     cs = a >> 16;
-    fixup_guest_selector(cs);
+    fixup_guest_code_selector(cs);
     if ( !guest_gate_selector_okay(cs) )
         goto bad;
     a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Wed Mar  1 17:01:54 2006
+++ b/xen/common/dom0_ops.c     Wed Mar  1 19:47:25 2006
@@ -46,6 +46,7 @@
     struct vcpu   *v;
     u64 cpu_time = 0;
     int flags = DOMFLAGS_BLOCKED;
+    struct vcpu_runstate_info runstate;
     
     info->domain = d->domain_id;
     info->nr_online_vcpus = 0;
@@ -55,7 +56,8 @@
      * - domain is marked as running if any of its vcpus is running
      */
     for_each_vcpu ( d, v ) {
-        cpu_time += v->cpu_time;
+        vcpu_runstate_get(v, &runstate);
+        cpu_time += runstate.time[RUNSTATE_running];
         info->max_vcpu_id = v->vcpu_id;
         if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
         {
@@ -165,7 +167,15 @@
         domid_t        dom;
         struct vcpu   *v;
         unsigned int   i, cnt[NR_CPUS] = { 0 };
+        cpumask_t      cpu_exclude_map;
         static domid_t rover = 0;
+
+        /*
+         * Running the domain 0 kernel in ring 0 is not compatible
+         * with multiple guests.
+         */
+        if ( supervisor_mode_kernel )
+            return -EINVAL;
 
         dom = op->u.createdomain.domain;
         if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
@@ -195,18 +205,29 @@
         read_lock(&domlist_lock);
         for_each_domain ( d )
             for_each_vcpu ( d, v )
-                cnt[v->processor]++;
+                if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
+                    cnt[v->processor]++;
         read_unlock(&domlist_lock);
         
         /*
-         * If we're on a HT system, we only use the first HT for dom0, other 
-         * domains will all share the second HT of each CPU. Since dom0 is on 
-         * CPU 0, we favour high numbered CPUs in the event of a tie.
+         * If we're on a HT system, we only auto-allocate to a non-primary HT.
+         * We favour high numbered CPUs in the event of a tie.
          */
-        pro = smp_num_siblings - 1;
-        for ( i = pro; i < num_online_cpus(); i += smp_num_siblings )
+        pro = first_cpu(cpu_sibling_map[0]);
+        if ( cpus_weight(cpu_sibling_map[0]) > 1 )
+            pro = next_cpu(pro, cpu_sibling_map[0]);
+        cpu_exclude_map = cpu_sibling_map[0];
+        for_each_online_cpu ( i )
+        {
+            if ( cpu_isset(i, cpu_exclude_map) )
+                continue;
+            if ( (i == first_cpu(cpu_sibling_map[i])) &&
+                 (cpus_weight(cpu_sibling_map[i]) > 1) )
+                continue;
+            cpus_or(cpu_exclude_map, cpu_exclude_map, cpu_sibling_map[i]);
             if ( cnt[i] <= cnt[pro] )
                 pro = i;
+        }
 
         ret = -ENOMEM;
         if ( (d = domain_create(dom, pro)) == NULL )
@@ -485,6 +506,7 @@
     { 
         struct domain *d;
         struct vcpu   *v;
+        struct vcpu_runstate_info runstate;
 
         ret = -ESRCH;
         if ( (d = find_domain_by_id(op->u.getvcpuinfo.domain)) == NULL )
@@ -498,10 +520,12 @@
         if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL )
             goto getvcpuinfo_out;
 
+        vcpu_runstate_get(v, &runstate);
+
         op->u.getvcpuinfo.online   = !test_bit(_VCPUF_down, &v->vcpu_flags);
         op->u.getvcpuinfo.blocked  = test_bit(_VCPUF_blocked, &v->vcpu_flags);
         op->u.getvcpuinfo.running  = test_bit(_VCPUF_running, &v->vcpu_flags);
-        op->u.getvcpuinfo.cpu_time = v->cpu_time;
+        op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running];
         op->u.getvcpuinfo.cpu      = v->processor;
         op->u.getvcpuinfo.cpumap   = 0;
         memcpy(&op->u.getvcpuinfo.cpumap,
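
Both dom0_ops hunks above switch cpu_time reporting from a raw per-VCPU
counter to the RUNSTATE_running component of a runstate snapshot. In
outline, with trimmed demo structures rather than the public ABI:

    #include <stdint.h>
    #include <stdio.h>

    enum { RS_running, RS_runnable, RS_blocked, RS_offline, RS_N };

    struct runstate_demo { int state; uint64_t time[RS_N]; };

    int main(void)
    {
        /* Two VCPUs of one domain; times are made-up nanosecond counts. */
        struct runstate_demo vcpu0 = { RS_running, { 500, 20, 100, 0 } };
        struct runstate_demo vcpu1 = { RS_blocked, { 300, 10, 400, 0 } };

        /* Domain cpu_time = sum of time actually spent running. */
        uint64_t cpu_time = vcpu0.time[RS_running] + vcpu1.time[RS_running];
        printf("domain cpu_time: %llu ns\n", (unsigned long long)cpu_time);
        return 0;
    }
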
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/domain.c
--- a/xen/common/domain.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/domain.c       Wed Mar  1 19:47:25 2006
@@ -451,6 +451,41 @@
     case VCPUOP_is_up:
         rc = !test_bit(_VCPUF_down, &v->vcpu_flags);
         break;
+
+    case VCPUOP_get_runstate_info:
+    {
+        struct vcpu_runstate_info runstate;
+        vcpu_runstate_get(v, &runstate);
+        if ( copy_to_user(arg, &runstate, sizeof(runstate)) )
+            rc = -EFAULT;
+        break;
+    }
+
+    case VCPUOP_register_runstate_memory_area:
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EINVAL;
+        if ( v != current )
+            break;
+
+        rc = -EFAULT;
+        if ( copy_from_user(&area, arg, sizeof(area)) )
+            break;
+
+        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
+            break;
+
+        rc = 0;
+        v->runstate_guest = area.addr.v;
+        __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
+
+        break;
+    }
+
+    default:
+        rc = -ENOSYS;
+        break;
     }
 
     return rc;
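
VCPUOP_register_runstate_memory_area, handled above, takes a union so the
area can be named either by virtual pointer or by a raw 64-bit value. A
guest-side sketch of setting that union up; the struct layouts are
simplified stand-ins, and a real guest would pass 'area' through its
vcpu_op hypercall wrapper:

    #include <stdint.h>
    #include <stdio.h>

    struct runstate_demo { int state; uint64_t state_entry_time; uint64_t time[4]; };

    /* Mirror of the address union: virtual pointer on x86, raw value elsewhere. */
    struct register_area_demo {
        union {
            struct runstate_demo *v;
            uint64_t p;
        } addr;
    };

    int main(void)
    {
        static struct runstate_demo shared;   /* guest-owned memory */
        struct register_area_demo area = { .addr.v = &shared };

        /* After registration the hypervisor updates 'shared' on every
         * schedule, so the guest reads time[] without further hypercalls. */
        printf("registering runstate area at %p\n", (void *)area.addr.v);
        return 0;
    }
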
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/kernel.c
--- a/xen/common/kernel.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/kernel.c       Wed Mar  1 19:47:25 2006
@@ -195,6 +195,8 @@
                     (1U << XENFEAT_writable_page_tables) |
                     (1U << XENFEAT_auto_translated_physmap) |
                     (1U << XENFEAT_pae_pgdir_above_4gb);
+            if ( supervisor_mode_kernel )
+                fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
             break;
         default:
             return -EINVAL;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/keyhandler.c   Wed Mar  1 19:47:25 2006
@@ -169,8 +169,6 @@
 }
 
 extern void dump_runq(unsigned char key);
-extern void print_sched_histo(unsigned char key);
-extern void reset_sched_histo(unsigned char key);
 #ifndef NDEBUG
 extern void audit_domains_key(unsigned char key);
 #endif
@@ -206,10 +204,6 @@
         'd', dump_registers, "dump registers"); 
     register_keyhandler(
         'h', show_handlers, "show this message");
-    register_keyhandler(
-        'l', print_sched_histo, "print sched latency histogram");
-    register_keyhandler(
-        'L', reset_sched_histo, "reset sched latency histogram");
     register_keyhandler(
         'q', dump_domains, "dump domain (and guest debug) info");
     register_keyhandler(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/memory.c
--- a/xen/common/memory.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/memory.c       Wed Mar  1 19:47:25 2006
@@ -16,6 +16,7 @@
 #include <xen/event.h>
 #include <xen/shadow.h>
 #include <xen/iocap.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/hardirq.h>
 #include <public/memory.h>
@@ -30,7 +31,7 @@
 static long
 increase_reservation(
     struct domain *d, 
-    unsigned long *extent_list, 
+    GUEST_HANDLE(xen_ulong) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
@@ -39,8 +40,8 @@
     struct page_info *page;
     unsigned long     i, mfn;
 
-    if ( (extent_list != NULL) &&
-         !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_is_null(extent_list) &&
+         !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     if ( (extent_order != 0) &&
@@ -65,10 +66,10 @@
         }
 
         /* Inform the domain of the new page's machine address. */ 
-        if ( extent_list != NULL )
+        if ( !guest_handle_is_null(extent_list) )
         {
             mfn = page_to_mfn(page);
-            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
                 return i;
         }
     }
@@ -79,16 +80,16 @@
 static long
 populate_physmap(
     struct domain *d, 
-    unsigned long *extent_list, 
-    unsigned int   nr_extents,
-    unsigned int   extent_order,
-    unsigned int   flags,
-    int           *preempted)
+    GUEST_HANDLE(xen_ulong) extent_list,
+    unsigned int  nr_extents,
+    unsigned int  extent_order,
+    unsigned int  flags,
+    int          *preempted)
 {
     struct page_info *page;
     unsigned long    i, j, gpfn, mfn;
 
-    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     if ( (extent_order != 0) &&
@@ -103,7 +104,7 @@
             goto out;
         }
 
-        if ( unlikely(__copy_from_user(&gpfn, &extent_list[i], sizeof(gpfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) )
             goto out;
 
         if ( unlikely((page = alloc_domheap_pages(
@@ -128,7 +129,7 @@
                 set_gpfn_from_mfn(mfn + j, gpfn + j);
 
             /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
                 goto out;
         }
     }
@@ -139,8 +140,8 @@
     
 static long
 decrease_reservation(
-    struct domain *d, 
-    unsigned long *extent_list, 
+    struct domain *d,
+    GUEST_HANDLE(xen_ulong) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
@@ -149,7 +150,7 @@
     struct page_info *page;
     unsigned long    i, j, gmfn, mfn;
 
-    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     for ( i = 0; i < nr_extents; i++ )
@@ -160,7 +161,7 @@
             return i;
         }
 
-        if ( unlikely(__copy_from_user(&gmfn, &extent_list[i], sizeof(gmfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gmfn, extent_list, i, 1)) )
             return i;
 
         for ( j = 0; j < (1 << extent_order); j++ )
@@ -197,21 +198,21 @@
 
 static long
 translate_gpfn_list(
-    struct xen_translate_gpfn_list *uop, unsigned long *progress)
+    GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
 {
     struct xen_translate_gpfn_list op;
     unsigned long i, gpfn, mfn;
     struct domain *d;
 
-    if ( copy_from_user(&op, uop, sizeof(op)) )
+    if ( copy_from_guest(&op, uop, 1) )
         return -EFAULT;
 
     /* Is size too large for us to encode a continuation? */
     if ( op.nr_gpfns > (ULONG_MAX >> START_EXTENT_SHIFT) )
         return -EINVAL;
 
-    if ( !array_access_ok(op.gpfn_list, op.nr_gpfns, sizeof(*op.gpfn_list)) ||
-         !array_access_ok(op.mfn_list, op.nr_gpfns, sizeof(*op.mfn_list)) )
+    if ( !guest_handle_okay(op.gpfn_list, op.nr_gpfns) ||
+         !guest_handle_okay(op.mfn_list,  op.nr_gpfns) )
         return -EFAULT;
 
     if ( op.domid == DOMID_SELF )
@@ -237,8 +238,7 @@
             return -EAGAIN;
         }
 
-        if ( unlikely(__copy_from_user(&gpfn, &op.gpfn_list[i],
-                                       sizeof(gpfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) )
         {
             put_domain(d);
             return -EFAULT;
@@ -246,8 +246,7 @@
 
         mfn = gmfn_to_mfn(d, gpfn);
 
-        if ( unlikely(__copy_to_user(&op.mfn_list[i], &mfn,
-                                     sizeof(mfn))) )
+        if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) )
         {
             put_domain(d);
             return -EFAULT;
@@ -258,7 +257,7 @@
     return 0;
 }
 
-long do_memory_op(unsigned long cmd, void *arg)
+long do_memory_op(unsigned long cmd, GUEST_HANDLE(void) arg)
 {
     struct domain *d;
     int rc, op, flags = 0, preempted = 0;
@@ -273,7 +272,7 @@
     case XENMEM_increase_reservation:
     case XENMEM_decrease_reservation:
     case XENMEM_populate_physmap:
-        if ( copy_from_user(&reservation, arg, sizeof(reservation)) )
+        if ( copy_from_guest(&reservation, arg, 1) )
             return -EFAULT;
 
         /* Is size too large for us to encode a continuation? */
@@ -283,9 +282,9 @@
         start_extent = cmd >> START_EXTENT_SHIFT;
         if ( unlikely(start_extent > reservation.nr_extents) )
             return -EINVAL;
-        
-        if ( reservation.extent_start != NULL )
-            reservation.extent_start += start_extent;
+
+        if ( !guest_handle_is_null(reservation.extent_start) )
+            guest_handle_add_offset(reservation.extent_start, start_extent);
         reservation.nr_extents -= start_extent;
 
         if ( (reservation.address_bits != 0) &&
@@ -342,8 +341,9 @@
         rc += start_extent;
 
         if ( preempted )
-            return hypercall2_create_continuation(
-                __HYPERVISOR_memory_op, op | (rc << START_EXTENT_SHIFT), arg);
+            return hypercall_create_continuation(
+                __HYPERVISOR_memory_op, "lh",
+                op | (rc << START_EXTENT_SHIFT), arg);
 
         break;
 
@@ -353,10 +353,10 @@
 
     case XENMEM_current_reservation:
     case XENMEM_maximum_reservation:
-        if ( copy_from_user(&domid, (domid_t *)arg, sizeof(domid)) )
+        if ( copy_from_guest(&domid, arg, 1) )
             return -EFAULT;
 
-        if ( likely((domid = (unsigned long)arg) == DOMID_SELF) )
+        if ( likely(domid == DOMID_SELF) )
             d = current->domain;
         else if ( !IS_PRIV(current->domain) )
             return -EPERM;
@@ -372,12 +372,13 @@
 
     case XENMEM_translate_gpfn_list:
         progress = cmd >> START_EXTENT_SHIFT;
-        rc = translate_gpfn_list(arg, &progress);
+        rc = translate_gpfn_list(
+            guest_handle_cast(arg, xen_translate_gpfn_list_t),
+            &progress);
         if ( rc == -EAGAIN )
-            return hypercall2_create_continuation(
-                __HYPERVISOR_memory_op,
-                op | (progress << START_EXTENT_SHIFT),
-                arg);
+            return hypercall_create_continuation(
+                __HYPERVISOR_memory_op, "lh",
+                op | (progress << START_EXTENT_SHIFT), arg);
         break;
 
     default:
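
The memory_op continuations above encode progress into the command word
itself: the sub-op stays in the low bits and the number of extents already
processed is shifted into the high bits. A sketch of that encoding, with an
illustrative shift width:

    #include <stdio.h>

    #define START_EXTENT_SHIFT_DEMO 4
    #define OP_MASK ((1ul << START_EXTENT_SHIFT_DEMO) - 1)

    int main(void)
    {
        unsigned long op   = 1;     /* e.g. an increase-reservation sub-op */
        unsigned long done = 128;   /* extents completed before preemption */

        unsigned long cmd = op | (done << START_EXTENT_SHIFT_DEMO);
        printf("resume op=%lu from extent %lu\n",
               cmd & OP_MASK, cmd >> START_EXTENT_SHIFT_DEMO);
        return 0;
    }
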
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/multicall.c
--- a/xen/common/multicall.c    Wed Mar  1 17:01:54 2006
+++ b/xen/common/multicall.c    Wed Mar  1 19:47:25 2006
@@ -81,8 +81,8 @@
             if ( i < nr_calls )
             {
                 mcs->flags = 0;
-                return hypercall2_create_continuation(
-                    __HYPERVISOR_multicall, &call_list[i], nr_calls-i);
+                return hypercall_create_continuation(
+                    __HYPERVISOR_multicall, "pi", &call_list[i], nr_calls-i);
             }
         }
     }
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/page_alloc.c   Wed Mar  1 19:47:25 2006
@@ -32,6 +32,7 @@
 #include <xen/softirq.h>
 #include <xen/shadow.h>
 #include <xen/domain_page.h>
+#include <xen/keyhandler.h>
 #include <asm/page.h>
 
 /*
@@ -662,6 +663,26 @@
 }
 
 
+static void pagealloc_keyhandler(unsigned char key)
+{
+    printk("Physical memory information:\n");
+    printk("    Xen heap: %lukB free\n"
+           "    DMA heap: %lukB free\n"
+           "    Dom heap: %lukB free\n",
+           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
+           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
+           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+}
+
+
+static __init int pagealloc_keyhandler_init(void)
+{
+    register_keyhandler('m', pagealloc_keyhandler, "memory info");
+    return 0;
+}
+__initcall(pagealloc_keyhandler_init);
+
+
 
 /*************************
  * PAGE SCRUBBING
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_bvt.c
--- a/xen/common/sched_bvt.c    Wed Mar  1 17:01:54 2006
+++ b/xen/common/sched_bvt.c    Wed Mar  1 19:47:25 2006
@@ -132,13 +132,13 @@
     vcpu_schedule_unlock_irq(v);
 }
 
-static inline u32 calc_avt(struct vcpu *d, s_time_t now)
+static inline u32 calc_avt(struct vcpu *v, s_time_t now)
 {
     u32 ranfor, mcus;
-    struct bvt_dom_info *inf = BVT_INFO(d->domain);
-    struct bvt_vcpu_info *einf = EBVT_INFO(d);
-    
-    ranfor = (u32)(now - d->lastschd);
+    struct bvt_dom_info *inf = BVT_INFO(v->domain);
+    struct bvt_vcpu_info *einf = EBVT_INFO(v);
+    
+    ranfor = (u32)(now - v->runstate.state_entry_time);
     mcus = (ranfor + MCU - 1)/MCU;
 
     return einf->avt + mcus * inf->mcu_advance;
@@ -262,7 +262,7 @@
     curr_evt = calc_evt(curr, calc_avt(curr, now));
     /* Calculate the time the current domain would run assuming
        the second smallest evt is of the newly woken domain */
-    r_time = curr->lastschd +
+    r_time = curr->runstate.state_entry_time +
         ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
         ctx_allow;
 
@@ -558,7 +558,6 @@
         printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
                test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
         bvt_dump_runq_el(v);
-        printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time);
         printk("         l: %p n: %p  p: %p\n",
                &vcpu_inf->run_list, vcpu_inf->run_list.next,
                vcpu_inf->run_list.prev);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/sched_sedf.c   Wed Mar  1 19:47:25 2006
@@ -1408,18 +1408,14 @@
 {
     printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
            test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
-    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64
+    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu"
            " sc=%i xtr(%s)=%"PRIu64" ew=%hu",
            EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
-           EDOM_INFO(d)->weight, d->cpu_time,
+           EDOM_INFO(d)->weight,
            EDOM_INFO(d)->score[EXTRA_UTIL_Q],
            (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
            EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
     
-    if ( d->cpu_time != 0 )
-        printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
-               / d->cpu_time);
-
 #ifdef SEDF_STATS
     if ( EDOM_INFO(d)->block_time_tot != 0 )
         printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/schedule.c
--- a/xen/common/schedule.c     Wed Mar  1 17:01:54 2006
+++ b/xen/common/schedule.c     Wed Mar  1 19:47:25 2006
@@ -36,14 +36,6 @@
 static char opt_sched[10] = "sedf";
 string_param("sched", opt_sched);
 
-/*#define WAKE_HISTO*/
-/*#define BLOCKTIME_HISTO*/
-#if defined(WAKE_HISTO)
-#define BUCKETS 31
-#elif defined(BLOCKTIME_HISTO)
-#define BUCKETS 200
-#endif
-
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
 
 /* Various timer handlers. */
@@ -73,6 +65,36 @@
 /* Per-CPU periodic timer sends an event to the currently-executing domain. */
 static struct timer t_timer[NR_CPUS]; 
 
+static inline void vcpu_runstate_change(
+    struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+    ASSERT(v->runstate.state != new_state);
+    ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));
+
+    v->runstate.time[v->runstate.state] +=
+        new_entry_time - v->runstate.state_entry_time;
+    v->runstate.state_entry_time = new_entry_time;
+    v->runstate.state = new_state;
+}
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+    if ( likely(v == current) )
+    {
+        /* Fast lock-free path. */
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        ASSERT(runstate->state == RUNSTATE_running);
+        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
+    }
+    else
+    {
+        vcpu_schedule_lock_irq(v);
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+        vcpu_schedule_unlock_irq(v);
+    }
+}
+
 struct domain *alloc_domain(void)
 {
     struct domain *d;
@@ -119,6 +141,9 @@
     v->cpu_affinity = is_idle_domain(d) ?
         cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;
 
+    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+    v->runstate.state_entry_time = NOW();
+
     if ( (vcpu_id != 0) && !is_idle_domain(d) )
         set_bit(_VCPUF_down, &v->vcpu_flags);
 
@@ -165,8 +190,15 @@
     unsigned long flags;
 
     vcpu_schedule_lock_irqsave(v, flags);
+
     if ( likely(!vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state == RUNSTATE_runnable )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
         SCHED_OP(sleep, v);
+    }
+
     vcpu_schedule_unlock_irqrestore(v, flags);
 
     TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
@@ -187,11 +219,19 @@
     unsigned long flags;
 
     vcpu_schedule_lock_irqsave(v, flags);
+
     if ( likely(vcpu_runnable(v)) )
     {
+        if ( v->runstate.state >= RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
         SCHED_OP(wake, v);
-        v->wokenup = NOW();
-    }
+    }
+    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
+    {
+        if ( v->runstate.state == RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+    }
+
     vcpu_schedule_unlock_irqrestore(v, flags);
 
     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
@@ -376,8 +416,6 @@
 
     stop_timer(&schedule_data[cpu].s_timer);
     
-    prev->cpu_time += now - prev->lastschd;
-
     /* get policy-specific decision on scheduling... */
     next_slice = ops.do_schedule(now);
 
@@ -386,8 +424,6 @@
 
     schedule_data[cpu].curr = next;
     
-    next->lastschd = now;
-
     set_timer(&schedule_data[cpu].s_timer, now + r_time);
 
     if ( unlikely(prev == next) )
@@ -397,38 +433,23 @@
     }
 
     TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
-             prev->domain->domain_id, now - prev->lastschd);
+             prev->domain->domain_id,
+             now - prev->runstate.state_entry_time);
     TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
-             next->domain->domain_id, now - next->wokenup, r_time);
-
-    /*
-     * Logic of wokenup field in domain struct:
-     * Used to calculate "waiting time", which is the time that a domain
-     * spends being "runnable", but not actually running. wokenup is set
-     * set whenever a domain wakes from sleeping. However, if wokenup is not
-     * also set here then a preempted runnable domain will get a screwed up
-     * "waiting time" value next time it is scheduled.
-     */
-    prev->wokenup = now;
-
-#if defined(WAKE_HISTO)
-    if ( !is_idle_vcpu(next) && next->wokenup )
-    {
-        ulong diff = (ulong)(now - next->wokenup);
-        diff /= (ulong)MILLISECS(1);
-        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
-        else                    schedule_data[cpu].hist[BUCKETS-1]++;
-    }
-    next->wokenup = (s_time_t)0;
-#elif defined(BLOCKTIME_HISTO)
-    prev->lastdeschd = now;
-    if ( !is_idle_vcpu(next) )
-    {
-        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
-        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
-        else                    schedule_data[cpu].hist[BUCKETS-1]++;
-    }
-#endif
+             next->domain->domain_id,
+             (next->runstate.state == RUNSTATE_runnable) ?
+             (now - next->runstate.state_entry_time) : 0,
+             r_time);
+
+    ASSERT(prev->runstate.state == RUNSTATE_running);
+    vcpu_runstate_change(
+        prev,
+        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
+         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+        now);
+
+    ASSERT(next->runstate.state != RUNSTATE_running);
+    vcpu_runstate_change(next, RUNSTATE_running, now);
 
     ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
     set_bit(_VCPUF_running, &next->vcpu_flags);
@@ -567,47 +588,6 @@
 
     local_irq_restore(flags);
 }
-
-#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
-
-void print_sched_histo(unsigned char key)
-{
-    int i, j, k;
-    for_each_online_cpu ( k )
-    {
-        j = 0;
-        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
-        for ( i = 0; i < BUCKETS; i++ )
-        {
-            if ( schedule_data[k].hist[i] != 0 )
-            {
-                if ( i < BUCKETS-1 )
-                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
-                else
-                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
-                if ( !(++j % 5) )
-                    printk("\n");
-            }
-        }
-        printk("\n");
-    }
-      
-}
-
-void reset_sched_histo(unsigned char key)
-{
-    int i, j;
-    for ( j = 0; j < NR_CPUS; j++ )
-        for ( i=0; i < BUCKETS; i++ ) 
-            schedule_data[j].hist[i] = 0;
-}
-
-#else
-
-void print_sched_histo(unsigned char key) { }
-void reset_sched_histo(unsigned char key) { }
-
-#endif
 
 /*
  * Local variables:
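
The vcpu_runstate_change helper introduced above maintains a simple
invariant: before the state is replaced, the time since state_entry_time is
charged to the outgoing state, so the four counters always sum to elapsed
time. A stand-alone version with a fake clock (all values illustrative):

    #include <stdint.h>
    #include <stdio.h>

    enum { RS_running, RS_runnable, RS_blocked, RS_offline, RS_N };

    struct vrs { int state; uint64_t entry; uint64_t time[RS_N]; };

    static void runstate_change(struct vrs *v, int new_state, uint64_t now)
    {
        v->time[v->state] += now - v->entry;   /* charge the old state */
        v->entry = now;
        v->state = new_state;
    }

    int main(void)
    {
        struct vrs v = { RS_offline, 0, { 0 } };
        runstate_change(&v, RS_runnable, 100);  /* woken */
        runstate_change(&v, RS_running,  150);  /* scheduled */
        runstate_change(&v, RS_blocked,  400);  /* blocked again */
        printf("ran for %llu ns, waited %llu ns\n",
               (unsigned long long)v.time[RS_running],
               (unsigned long long)v.time[RS_runnable]);
        return 0;
    }
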
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Wed Mar  1 17:01:54 2006
+++ b/xen/drivers/char/console.c        Wed Mar  1 19:47:25 2006
@@ -335,8 +335,9 @@
         }
 
         if ( hypercall_preempt_check() )
-            return hypercall3_create_continuation(
-                __HYPERVISOR_console_io, CONSOLEIO_write, count, buffer);
+            return hypercall_create_continuation(
+                __HYPERVISOR_console_io, "iip",
+                CONSOLEIO_write, count, buffer);
 
         kcount = min_t(int, count, sizeof(kbuf)-1);
         if ( copy_from_user(kbuf, buffer, kcount) )
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/config.h
--- a/xen/include/asm-ia64/config.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/config.h     Wed Mar  1 19:47:25 2006
@@ -36,6 +36,8 @@
 //#define CONFIG_NR_CPUS 16
 //leave SMP for a later time
 //#undef CONFIG_SMP
+
+#define supervisor_mode_kernel (0)
 
 #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
 
@@ -190,11 +192,6 @@
 
 #define find_first_set_bit(x)  (ffs(x)-1)      // FIXME: Is this right???
 
-// from include/asm-x86/*/uaccess.h
-#define array_access_ok(addr,count,size)                       \
-    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
-     access_ok(type,addr,count*size))
-
 // see drivers/char/console.c
 #ifndef VALIDATE_VT
 #define        OPT_CONSOLE_STR "com1"
@@ -299,7 +296,6 @@
 //#define raw_smp_processor_id()       0
 //#endif
 
-
 #ifndef __ASSEMBLY__
 #include <linux/linkage.h>
 #define FORCE_CRASH()  asm("break.m 0;;");
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/README.origin
--- a/xen/include/asm-ia64/linux-xen/asm/README.origin  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/linux-xen/asm/README.origin  Wed Mar  1 19:47:25 2006
@@ -22,4 +22,3 @@
 system.h               -> linux/include/asm-ia64/system.h
 tlbflush.h             -> linux/include/asm-ia64/tlbflush.h
 types.h                        -> linux/include/asm-ia64/types.h
-uaccess.h              -> linux/include/asm-ia64/uaccess.h
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/config.h      Wed Mar  1 19:47:25 2006
@@ -36,6 +36,12 @@
 #define OPT_CONSOLE_STR "com1,vga"
 
 #define NR_CPUS 32
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+# define supervisor_mode_kernel (1)
+#else
+# define supervisor_mode_kernel (0)
+#endif
 
 /* Linkage for x86 */
 #define __ALIGN .align 16,0x90
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/desc.h
--- a/xen/include/asm-x86/desc.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/desc.h        Wed Mar  1 19:47:25 2006
@@ -27,9 +27,22 @@
 #endif
 
 /* Fix up the RPL of a guest segment selector. */
-#define fixup_guest_selector(sel)                               \
+#define __fixup_guest_selector(sel)                             \
     ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) :        \
      (((sel) & ~3) | GUEST_KERNEL_RPL))
+
+/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define fixup_guest_stack_selector(ss) ((void)0)
+#else
+#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
+#endif
+
+/*
+ * Code selectors are always fixed up. It allows the Xen exit stub to detect
+ * return to guest context, even when the guest kernel runs in ring 0.
+ */
+#define fixup_guest_code_selector(cs)  __fixup_guest_selector(cs)
 
 /*
  * We need this function because enforcing the correct guest kernel RPL is
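
Both new fixup macros above reduce to the same RPL clamp: selectors whose
RPL is already at or above the guest kernel's ring pass through, anything
lower is raised. A sketch with GUEST_KERNEL_RPL taken as 1, the classic
ring-1 x86_32 case:

    #include <stdio.h>

    #define GUEST_KERNEL_RPL_DEMO 1

    static unsigned int fixup_selector(unsigned int sel)
    {
        return ((sel & 3) >= GUEST_KERNEL_RPL_DEMO)
               ? sel : ((sel & ~3u) | GUEST_KERNEL_RPL_DEMO);
    }

    int main(void)
    {
        printf("0x10 -> 0x%x\n", fixup_selector(0x10));  /* RPL 0 -> RPL 1 */
        printf("0x13 -> 0x%x\n", fixup_selector(0x13));  /* RPL 3 untouched */
        return 0;
    }
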
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Mar  1 19:47:25 2006
@@ -67,6 +67,9 @@
     int (*paging_enabled)(struct vcpu *v);
     int (*instruction_length)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+    void (*init_ap_context)(struct vcpu_guest_context *ctxt,
+                            int vcpuid, int trampoline_vector);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -173,4 +176,14 @@
         return hvm_funcs.get_guest_ctrl_reg(v, num);
     return 0;                   /* force to fail */
 }
+
+static inline void
+hvm_init_ap_context(struct vcpu_guest_context *ctxt,
+                    int vcpuid, int trampoline_vector)
+{
+    return hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector);
+}
+
+extern int hvm_bringup_ap(int vcpuid, int trampoline_vector);
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar  1 19:47:25 2006
@@ -83,15 +83,15 @@
         struct cpu_user_regs *regs, const u8 prefix, const u8 *operand, 
         u8 *size);
 extern OPERATING_MODE get_operating_mode (struct vmcb_struct *vmcb);
-extern unsigned int decode_dest_reg(u8 modrm);
-extern unsigned int decode_src_reg(u8 modrm);
+extern unsigned int decode_dest_reg(u8 prefix, u8 modrm);
+extern unsigned int decode_src_reg(u8 prefix, u8 modrm);
 extern unsigned long svm_rip2pointer(struct vmcb_struct *vmcb);
-extern unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+extern int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
         enum instruction_index *list, unsigned int list_count, 
         u8 *guest_eip_buf, enum instruction_index *match);
 
 
-static inline unsigned int __get_instruction_length(struct vmcb_struct *vmcb, 
+static inline int __get_instruction_length(struct vmcb_struct *vmcb, 
         enum instruction_index instr, u8 *guest_eip_buf)
 {
     return __get_instruction_length_from_list(vmcb, &instr, 1, guest_eip_buf, 
@@ -138,9 +138,20 @@
 }
 
 
+static inline int skip_prefix_bytes(u8 *buf, size_t size)
+{
+    int index;
+    for (index = 0; index < size && is_prefix(buf[index]); index ++)  
+        /* do nothing */ ;
+    return index;
+}
+
+
+
 static void inline __update_guest_eip(struct vmcb_struct *vmcb, 
-        unsigned long inst_len) 
+        int inst_len) 
 {
+    ASSERT(inst_len > 0);
     vmcb->rip += inst_len;
 }
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar  1 19:47:25 2006
@@ -54,6 +54,8 @@
 /* For debugging. Remove when no longer needed. */
 extern void svm_dump_host_regs(const char *from);
 
+extern void svm_migrate_timers(struct vcpu *v);
+
 /* ASID API */
 enum {
     ASID_AVAILABLE = 0,
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar  1 19:47:25 2006
@@ -269,21 +269,6 @@
 #define SVM_LONG_GUEST(ed)    \
   (test_bit(SVM_CPU_STATE_LMA_ENABLED, &ed->arch.hvm_svm.cpu_state))
 
-enum {
-    SVM_INDEX_MSR_LSTAR = 0,
-    SVM_INDEX_MSR_STAR,
-    SVM_INDEX_MSR_CSTAR,
-    SVM_INDEX_MSR_SYSCALL_MASK,
-    SVM_INDEX_MSR_EFER,
-
-    SVM_MSR_COUNT,
-};
-
-struct svm_msr_state {
-    unsigned long flags;
-    unsigned long msr_items[SVM_MSR_COUNT];
-    unsigned long shadow_gs;
-};
 
 /* 
  * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the
@@ -449,7 +434,7 @@
 
 struct arch_svm_struct {
     struct vmcb_struct *vmcb;
-    void               *host_save_area;
+    void                       *host_save_area;
     u64                 host_save_pa;
     u64                 vmcb_pa;
     u32                 *iopm;
@@ -457,14 +442,15 @@
     u64                 vmexit_tsc; /* tsc read at #VMEXIT. for TSC_OFFSET */
     int                 injecting_event;
     int                 saved_irq_vector;
-    u32                 core;        /* cpu of last vmexit */
+    u32                 launch_core;
+    u32                 asid_core;
     
     unsigned long       flags;      /* VMCB flags */
-    unsigned long       cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+    unsigned long       cpu_shadow_cr0; /* Guest value for CR0 */
+    unsigned long       cpu_shadow_cr4; /* Guest value for CR4 */
     unsigned long       cpu_cr2;
     unsigned long       cpu_cr3;
     unsigned long       cpu_state;
-    struct svm_msr_state msr_content;
     struct timer        hlt_timer;  /* hlt ins emulation wakeup timer */
 };
 
@@ -485,6 +471,14 @@
 
 #define VMCB_EFLAGS_RESERVED_0          0xffc08028 /* bitmap for 0 */
 #define VMCB_EFLAGS_RESERVED_1          0x00000002 /* bitmap for 1 */
+
+/* These bits in the CR4 are owned by the host */
+#ifdef __i386__
+#define SVM_CR4_HOST_MASK (0)
+#else
+#define SVM_CR4_HOST_MASK (X86_CR4_PAE)
+#endif
+
 
 #endif /* ASM_X86_HVM_SVM_VMCS_H__ */
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Mar  1 19:47:25 2006
@@ -25,10 +25,15 @@
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/svm/vmcb.h>
 
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM          0
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
+
 struct hvm_vcpu {
-    unsigned long       ioflags;
-    struct mmio_op      mmio_op;
-    struct vlapic       *vlapic;
+    unsigned long   ioflags;
+    struct mmio_op  mmio_op;
+    struct vlapic   *vlapic;
+    /* For AP startup */
+    unsigned long   init_sipi_sipi_state;
 
     union {
         struct arch_vmx_struct vmx;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vlapic.h
--- a/xen/include/asm-x86/hvm/vlapic.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vlapic.h  Wed Mar  1 19:47:25 2006
@@ -158,9 +158,6 @@
     int deliver_mode;
     int source[6];
 } direct_intr_info_t;
-
-#define VLAPIC_INIT_SIPI_SIPI_STATE_NORM          0
-#define VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
 
 struct vlapic
 {
@@ -197,7 +194,6 @@
     unsigned long      init_ticks;
     uint32_t           err_write_count;
     uint64_t           apic_base_msr;
-    uint32_t           init_sipi_sipi_state;
     struct vcpu        *vcpu;
     struct domain      *domain;
 };
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/mm.h  Wed Mar  1 19:47:25 2006
@@ -337,6 +337,10 @@
         UNLOCK_BIGLOCK(d);                                      \
     } while ( 0 )
 
+#define writable_pagetable_in_sync(d)           \
+    (!((d)->arch.ptwr[PTWR_PT_ACTIVE].l1va |    \
+       (d)->arch.ptwr[PTWR_PT_INACTIVE].l1va))
+
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
 
 #ifndef NDEBUG
@@ -376,7 +380,7 @@
 int __sync_lazy_execstate(void);
 
 /* Arch-specific portion of memory_op hypercall. */
-long arch_memory_op(int op, void *arg);
-long subarch_memory_op(int op, void *arg);
+long arch_memory_op(int op, GUEST_HANDLE(void) arg);
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg);
 
 #endif /* __ASM_X86_MM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_64.h
--- a/xen/include/asm-x86/shadow_64.h   Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_64.h   Wed Mar  1 19:47:25 2006
@@ -223,6 +223,7 @@
     int i;
     pgentry_64_t *le_e;
     pgentry_64_t *le_p = NULL;
+    pgentry_64_t *phys_vtable = NULL;
     unsigned long mfn;
     int index;
     u32 level = flag & L_MASK;
@@ -251,25 +252,35 @@
     {
         root_level = PAE_PAGING_LEVELS;
         index = table_offset_64(va, root_level);
-        le_e = (pgentry_64_t *)map_domain_page(
+        phys_vtable = (pgentry_64_t *)map_domain_page(
             pagetable_get_pfn(v->domain->arch.phys_table));
+        le_e = &phys_vtable[index];
     }
 
     /*
      * If it's not external mode, then mfn should be machine physical.
      */
-    for (i = root_level - level; i > 0; i--) {
-        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) {
+    for ( i = root_level - level; i > 0; i-- )
+    {
+        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
+        {
             if ( le_p )
                 unmap_domain_page(le_p);
+
+            if ( phys_vtable )
+                unmap_domain_page(phys_vtable);
+
             return 0;
         }
+
         mfn = entry_get_pfn(*le_e);
         if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
             mfn = get_mfn_from_gpfn(mfn);
+
         if ( le_p )
             unmap_domain_page(le_p);
         le_p = (pgentry_64_t *)map_domain_page(mfn);
+
         if ( flag & SHADOW_ENTRY )
             index = table_offset_64(va, (level + i - 1));
         else
@@ -285,8 +296,10 @@
     if ( le_p )
         unmap_domain_page(le_p);
 
+    if ( phys_vtable )
+        unmap_domain_page(phys_vtable);
+
     return 1;
-
 }
 
 static inline int __rw_entry(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_public.h
--- a/xen/include/asm-x86/shadow_public.h       Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_public.h       Wed Mar  1 19:47:25 2006
@@ -21,8 +21,6 @@
 
 #ifndef _XEN_SHADOW_PUBLIC_H
 #define _XEN_SHADOW_PUBLIC_H
-
-extern int alloc_p2m_table(struct domain *d);
 
 #if CONFIG_PAGING_LEVELS >= 3
 #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/x86_32/asm_defns.h
--- a/xen/include/asm-x86/x86_32/asm_defns.h    Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/x86_32/asm_defns.h    Wed Mar  1 19:47:25 2006
@@ -48,9 +48,24 @@
 
 #ifdef PERF_COUNTERS
 #define PERFC_INCR(_name,_idx)                          \
-    lock incl perfcounters+_name(,_idx,4)
+        lock incl perfcounters+_name(,_idx,4)
 #else
 #define PERFC_INCR(_name,_idx)
+#endif
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define FIXUP_RING0_GUEST_STACK                         \
+        testl $2,8(%esp);                               \
+        jnz 1f; /* rings 2 & 3 permitted */             \
+        testl $1,8(%esp);                               \
+        jz 2f;                                          \
+        ud2; /* ring 1 should not be used */            \
+        2:cmpl $(__HYPERVISOR_VIRT_START),%esp;         \
+        jge 1f;                                         \
+        call fixup_ring0_guest_stack;                   \
+        1:
+#else
+#define FIXUP_RING0_GUEST_STACK
 #endif
 
 #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
@@ -61,6 +76,7 @@
     ".globl " STR(x) "\n\t"                     \
     STR(x) ":\n\t"                              \
     "pushl $"#v"<<16\n\t"                       \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
@@ -72,6 +88,7 @@
 __asm__(                                        \
     "\n" __ALIGN_STR"\n"                        \
     "common_interrupt:\n\t"                     \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/memory.h
--- a/xen/include/public/memory.h       Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/memory.h       Wed Mar  1 19:47:25 2006
@@ -29,7 +29,7 @@
      *   OUT: GMFN bases of extents that were allocated
      *   (NB. This command also updates the mach_to_phys translation table)
      */
-    unsigned long *extent_start;
+    GUEST_HANDLE(xen_ulong) extent_start;
 
     /* Number of extents, and size/alignment of each (2^extent_order pages). */
     unsigned long  nr_extents;
@@ -50,6 +50,7 @@
     domid_t        domid;
 
 } xen_memory_reservation_t;
+DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
 
 /*
  * Returns the maximum machine frame number of mapped RAM in this system.
@@ -85,7 +86,7 @@
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-    unsigned long *extent_start;
+    GUEST_HANDLE(xen_ulong) extent_start;
 
     /*
      * Number of extents written to the above array. This will be smaller
@@ -93,6 +94,7 @@
      */
     unsigned int nr_extents;
 } xen_machphys_mfn_list_t;
+DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
 
 /*
  * Returns the base and size of the specified reserved 'RAM hole' in the
@@ -113,6 +115,7 @@
     /* Base and size of the specified reserved area. */
     unsigned long first_gpfn, nr_gpfns;
 } xen_reserved_phys_area_t;
+DEFINE_GUEST_HANDLE(xen_reserved_phys_area_t);
 
 /*
  * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
@@ -127,14 +130,15 @@
     unsigned long nr_gpfns;
 
     /* List of GPFNs to translate. */
-    unsigned long *gpfn_list;
+    GUEST_HANDLE(xen_ulong) gpfn_list;
 
     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-    unsigned long *mfn_list;
+    GUEST_HANDLE(xen_ulong) mfn_list;
 } xen_translate_gpfn_list_t;
+DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
 
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
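
The extent_start, gpfn_list and mfn_list fields above change from bare
pointers to typed guest handles; in essence a handle is a struct-wrapped
pointer, which stops differently-typed hypercall arguments being
interchanged silently. A much-simplified sketch of the wrapper and the
offset idiom; the real macros also deal with translated guests and safe
copying:

    #include <stdio.h>

    typedef unsigned long xen_ulong_demo;

    /* One distinct wrapper type per element type. */
    #define DEFINE_GUEST_HANDLE_DEMO(type) \
        typedef struct { type *p; } guest_handle_##type

    DEFINE_GUEST_HANDLE_DEMO(xen_ulong_demo);

    int main(void)
    {
        xen_ulong_demo extents[4] = { 1, 2, 3, 4 };
        guest_handle_xen_ulong_demo h = { extents };

        h.p += 2;   /* guest_handle_add_offset analogue for continuations */
        printf("resumes at extent value %lu\n", *h.p);
        return 0;
    }
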
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/vcpu.h
--- a/xen/include/public/vcpu.h Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/vcpu.h Wed Mar  1 19:47:25 2006
@@ -51,6 +51,61 @@
 /* Returns 1 if the given VCPU is up. */
 #define VCPUOP_is_up                3
 
+/*
+ * Return information about the state and running time of a VCPU.
+ * @extra_arg == pointer to vcpu_runstate_info structure.
+ */
+#define VCPUOP_get_runstate_info    4
+typedef struct vcpu_runstate_info {
+    /* VCPU's current state (RUNSTATE_*). */
+    int      state;
+    /* When was current state entered (system time, ns)? */
+    uint64_t state_entry_time;
+    /*
+     * Time spent in each RUNSTATE_* (ns). The sum of these times is
+     * guaranteed not to drift from system time.
+     */
+    uint64_t time[4];
+} vcpu_runstate_info_t;
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running  0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked  2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline  3
+
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ *  1. The registered address may be virtual or physical, depending on the
+ *     platform. The virtual address should be registered on x86 systems.
+ *  2. Only one shared area may be registered per VCPU. The shared area is
+ *     updated by the hypervisor each time the VCPU is scheduled. Thus
+ *     runstate.state will always be RUNSTATE_running and
+ *     runstate.state_entry_time will indicate the system time at which the
+ *     VCPU was last scheduled to run.
+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_memory_area 5
+typedef struct vcpu_register_runstate_memory_area {
+    union {
+        struct vcpu_runstate_info *v;
+        uint64_t p;
+    } addr;
+} vcpu_register_runstate_memory_area_t;
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*
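Taken together, a guest registers the shared area once per VCPU and can then
read its runstate locklessly. A minimal sketch, assuming the usual guest
hypercall wrapper HYPERVISOR_vcpu_op(cmd, vcpuid, extra_arg):

    static struct vcpu_runstate_info runstate;  /* rewritten by Xen at each schedule */

    static void register_runstate_area(int vcpu)
    {
            vcpu_register_runstate_memory_area_t area;
            area.addr.v = &runstate;            /* virtual address on x86 (note 1) */
            HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, vcpu, &area);
    }

    static uint64_t stolen_ns(void)
    {
            /* runnable-but-unscheduled time plus administratively offline time */
            return runstate.time[RUNSTATE_runnable] + runstate.time[RUNSTATE_offline];
    }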
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/version.h
--- a/xen/include/public/version.h      Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/version.h      Wed Mar  1 19:47:25 2006
@@ -48,36 +48,8 @@
     uint32_t     submap;        /* OUT: 32-bit submap */
 } xen_feature_info_t;
 
-/*
- * If set, the guest does not need to write-protect its pagetables, and can
- * update them via direct writes.
- */
-#define XENFEAT_writable_page_tables       0
-
-/*
- * If set, the guest does not need to write-protect its segment descriptor
- * tables, and can update them via direct writes.
- */
-#define XENFEAT_writable_descriptor_tables 1
-
-/*
- * If set, translation between the guest's 'pseudo-physical' address space
- * and the host's machine address space are handled by the hypervisor. In this
- * mode the guest does not need to perform phys-to/from-machine translations
- * when performing page table operations.
- */
-#define XENFEAT_auto_translated_physmap    2
-
-/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
-#define XENFEAT_supervisor_mode_kernel     3
-
-/*
- * If set, the guest does not need to allocate x86 PAE page directories
- * below 4GB. This flag is usually implied by auto_translated_physmap.
- */
-#define XENFEAT_pae_pgdir_above_4gb        4
-
-#define XENFEAT_NR_SUBMAPS 1
+/* Declares the features reported by XENVER_get_features. */
+#include "features.h"
 
 #endif /* __XEN_PUBLIC_VERSION_H__ */
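The probing idiom is unchanged by the move; a sketch, assuming the usual guest
wrapper HYPERVISOR_xen_version and a submap_idx input field in
xen_feature_info_t (neither is shown in this hunk):

    xen_feature_info_t fi = { .submap_idx = 0 };
    if (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
        (fi.submap & (1u << XENFEAT_writable_page_tables)))
            /* guest may update its page tables by direct writes */;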
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/xen.h
--- a/xen/include/public/xen.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/xen.h  Wed Mar  1 19:47:25 2006
@@ -8,6 +8,22 @@
 
 #ifndef __XEN_PUBLIC_XEN_H__
 #define __XEN_PUBLIC_XEN_H__
+
+#ifdef __XEN__
+#define DEFINE_GUEST_HANDLE(type) struct __guest_handle_ ## type { type *p; }
+#define GUEST_HANDLE(type)        struct __guest_handle_ ## type
+#else
+#define DEFINE_GUEST_HANDLE(type)
+#define GUEST_HANDLE(type)        type *
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handle for unsigned long pointer. Define a name with no whitespace. */
+typedef unsigned long xen_ulong;
+DEFINE_GUEST_HANDLE(xen_ulong);
+/* Guest handle for arbitrary-type pointer (void *). */
+DEFINE_GUEST_HANDLE(void);
+#endif
 
 #if defined(__i386__)
 #include "arch-x86_32.h"
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/sched-if.h        Wed Mar  1 19:47:25 2006
@@ -8,9 +8,6 @@
 #ifndef __XEN_SCHED_IF_H__
 #define __XEN_SCHED_IF_H__
 
-#define BUCKETS  10
-/*300*/
-
 struct schedule_data {
     spinlock_t          schedule_lock;  /* spinlock protecting curr        */
     struct vcpu        *curr;           /* current task                    */
@@ -18,9 +15,6 @@
     void               *sched_priv;
     struct timer        s_timer;        /* scheduling timer                */
     unsigned long       tick;           /* current periodic 'tick'         */
-#ifdef BUCKETS
-    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
-#endif
 } __cacheline_aligned;
 
 extern struct schedule_data schedule_data[];
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/sched.h   Wed Mar  1 19:47:25 2006
@@ -8,6 +8,7 @@
 #include <xen/smp.h>
 #include <public/xen.h>
 #include <public/dom0_ops.h>
+#include <public/vcpu.h>
 #include <xen/time.h>
 #include <xen/timer.h>
 #include <xen/grant_table.h>
@@ -63,14 +64,13 @@
 
     struct vcpu     *next_in_list;
 
-    struct timer  timer;         /* one-shot timer for timeout values */
+    struct timer     timer;         /* one-shot timer for timeout values */
     unsigned long    sleep_tick;    /* tick at which this vcpu started sleep */
 
-    s_time_t         lastschd;      /* time this domain was last scheduled */
-    s_time_t         lastdeschd;    /* time this domain was last descheduled */
-    s_time_t         cpu_time;      /* total CPU time received till now */
-    s_time_t         wokenup;       /* time domain got woken up */
     void            *sched_priv;    /* scheduler-specific data */
+
+    struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *runstate_guest; /* guest address */
 
     unsigned long    vcpu_flags;
 
@@ -303,31 +303,18 @@
 
 void startup_cpu_idle_loop(void);
 
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...);
-#define hypercall0_create_continuation(_op)                               \
-    __hypercall_create_continuation((_op), 0)
-#define hypercall1_create_continuation(_op, _a1)                          \
-    __hypercall_create_continuation((_op), 1,                             \
-        (unsigned long)(_a1))
-#define hypercall2_create_continuation(_op, _a1, _a2)                     \
-    __hypercall_create_continuation((_op), 2,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2))
-#define hypercall3_create_continuation(_op, _a1, _a2, _a3)                \
-    __hypercall_create_continuation((_op), 3,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3))
-#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4)           \
-    __hypercall_create_continuation((_op), 4,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4))
-#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5)      \
-    __hypercall_create_continuation((_op), 5,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4), (unsigned long)(_a5))
-#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \
-    __hypercall_create_continuation((_op), 6,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6))
+/*
+ * Creates a continuation to resume the current hypercall. The caller should
+ * return immediately, propagating the value returned from this invocation.
+ * The format string specifies the types and number of hypercall arguments.
+ * It contains one character per argument as follows:
+ *  'i' [unsigned] {char, int}
+ *  'l' [unsigned] long
+ *  'p' pointer (foo *)
+ *  'h' guest handle (GUEST_HANDLE(foo))
+ */
+unsigned long hypercall_create_continuation(
+    unsigned int op, const char *format, ...);
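A preemptible handler would now bail out as in the sketch below;
__HYPERVISOR_memory_op stands in for any hypercall number, and the "lh" string
encodes one long plus one guest-handle argument:

    if ( hypercall_preempt_check() )
        return hypercall_create_continuation(__HYPERVISOR_memory_op, "lh", cmd, arg);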
 
 #define hypercall_preempt_check() (unlikely(    \
         softirq_pending(smp_processor_id()) |   \
@@ -397,7 +384,6 @@
 #define _DOMF_debugging        4
 #define DOMF_debugging         (1UL<<_DOMF_debugging)
 
-
 static inline int vcpu_runnable(struct vcpu *v)
 {
     return ( (atomic_read(&v->pausecnt) == 0) &&
@@ -415,6 +401,8 @@
 
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
+
 static inline void vcpu_unblock(struct vcpu *v)
 {
     if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) )
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/string.h
--- a/xen/include/xen/string.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/string.h  Wed Mar  1 19:47:25 2006
@@ -24,6 +24,9 @@
 #endif
 #ifndef __HAVE_ARCH_STRNCPY
 extern char * strncpy(char *,const char *, __kernel_size_t);
+#endif
+#ifndef __HAVE_ARCH_STRLCPY
+extern size_t strlcpy(char *,const char *, __kernel_size_t);
 #endif
 #ifndef __HAVE_ARCH_STRCAT
 extern char * strcat(char *, const char *);
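The newly declared strlcpy() differs from strncpy() in that it always
NUL-terminates and returns the length of the source string, so truncation is
detectable from the return value; a usage sketch:

    char name[8];
    if (strlcpy(name, src, sizeof(name)) >= sizeof(name))
            /* 'src' did not fit: 'name' holds a truncated, NUL-terminated copy */;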
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/pgtable.c
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable.c       Wed Mar  1 19:47:25 2006
@@ -0,0 +1,283 @@
+/*
+ *  linux/arch/i386/mm/pgtable.c
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+void show_mem(void)
+{
+       int total = 0, reserved = 0;
+       int shared = 0, cached = 0;
+       int highmem = 0;
+       struct page *page;
+       pg_data_t *pgdat;
+       unsigned long i;
+       struct page_state ps;
+       unsigned long flags;
+
+       printk(KERN_INFO "Mem-info:\n");
+       show_free_areas();
+       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+       for_each_pgdat(pgdat) {
+               pgdat_resize_lock(pgdat, &flags);
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       page = pgdat_page_nr(pgdat, i);
+                       total++;
+                       if (PageHighMem(page))
+                               highmem++;
+                       if (PageReserved(page))
+                               reserved++;
+                       else if (PageSwapCache(page))
+                               cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+               pgdat_resize_unlock(pgdat, &flags);
+       }
+       printk(KERN_INFO "%d pages of RAM\n", total);
+       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+       printk(KERN_INFO "%d reserved pages\n", reserved);
+       printk(KERN_INFO "%d pages shared\n", shared);
+       printk(KERN_INFO "%d pages swap cached\n", cached);
+
+       get_page_state(&ps);
+       printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
+       printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
+       printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
+       printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
+       printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* <pfn,flags> stored as-is, to permit clearing entries */
+       set_pte(pte, pfn_pte(pfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame 
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned. 
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */ 
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+               return; /* BUG(); */
+       }
+       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+               return; /* BUG(); */
+       }
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+               return; /* BUG(); */
+       }
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       set_pmd(pmd, pfn_pmd(pfn, flags));
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+static int nr_fixmaps = 0;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+       unsigned long address = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+       nr_fixmaps++;
+}
+
+void set_fixaddr_top(unsigned long top)
+{
+       BUG_ON(nr_fixmaps > 0);
+       __FIXADDR_TOP = top - PAGE_SIZE;
+}
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+       return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+       return pte;
+}
+
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
+ * -- wli
+ */
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+       page->index = (unsigned long)pgd_list;
+       if (pgd_list)
+               set_page_private(pgd_list, (unsigned long)&page->index);
+       pgd_list = page;
+       set_page_private(page, (unsigned long)&pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+       struct page *next, **pprev, *page = virt_to_page(pgd);
+       next = (struct page *)page->index;
+       pprev = (struct page **)page_private(page);
+       *pprev = next;
+       if (next)
+               set_page_private(next, (unsigned long)pprev);
+}
+
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       unsigned long flags;
+
+       if (PTRS_PER_PMD == 1) {
+               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+               spin_lock_irqsave(&pgd_lock, flags);
+       }
+
+       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                       swapper_pg_dir + USER_PTRS_PER_PGD,
+                       KERNEL_PGD_PTRS);
+       if (PTRS_PER_PMD > 1)
+               return;
+
+       pgd_list_add(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/* never called when PTRS_PER_PMD > 1 */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       unsigned long flags; /* can be called from interrupt context */
+
+       spin_lock_irqsave(&pgd_lock, flags);
+       pgd_list_del(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       int i;
+       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+       if (PTRS_PER_PMD == 1 || !pgd)
+               return pgd;
+
+       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+               if (!pmd)
+                       goto out_oom;
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
+       return pgd;
+
+out_oom:
+       for (i--; i >= 0; i--)
+               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       kmem_cache_free(pgd_cache, pgd);
+       return NULL;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+       int i;
+
+       /* in the PAE case user pgd entries are overwritten before usage */
+       if (PTRS_PER_PMD > 1)
+               for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+                       kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       kmem_cache_free(pgd_cache, pgd);
+}
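+
One idiom above is easy to miss: __pgd(1 + __pa(pmd)) tags the pmd's physical
address with bit 0 (the present bit), and pgd_val(pgd[i]) - 1 strips that bit
again before __va() recovers the virtual pointer. As a sketch (helper name
illustrative):

    static pmd_t *pmd_from_pgd_entry(pgd_t e)
    {
            return (pmd_t *)__va(pgd_val(e) - 1);  /* -1 clears the present bit */
    }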
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/fixmap.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/fixmap.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,151 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_FIXMAP_H
+#define _ASM_FIXMAP_H
+
+#include <linux/config.h>
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * these 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages. (or larger if used with an increment
+ * highger than 1) use fixmap_set(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+       FIX_HOLE,
+#ifdef CONFIG_X86_LOCAL_APIC
+       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+       FIX_IO_APIC_BASE_0,
+       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+       FIX_CO_CPU,     /* Cobalt timer */
+       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
+       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
+       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+       FIX_F00F_IDT,   /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+       FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif 
+#ifdef CONFIG_HIGHMEM
+       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#endif
+#ifdef CONFIG_ACPI
+       FIX_ACPI_BEGIN,
+       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+       FIX_PCIE_MCFG,
+#endif
+       __end_of_permanent_fixed_addresses,
+       /* temporary boot-time mappings, used before ioremap() is functional */
+#define NR_FIX_BTMAPS  16
+       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
+       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+       FIX_WP_TEST,
+       __end_of_fixed_addresses
+};
+
+extern void __set_fixmap (enum fixed_addresses idx,
+                                       unsigned long phys, pgprot_t flags);
+
+extern void set_fixaddr_top(unsigned long top);
+
+#define set_fixmap(idx, phys) \
+               __set_fixmap(idx, phys, PAGE_KERNEL)
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+               __set_fixmap(idx, 0, __pgprot(0))
+
+#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+       /*
+        * this branch gets completely eliminated after inlining,
+        * except when someone tries to use fixaddr indices in an
+        * illegal way. (such as mixing up address types or using
+        * out-of-range indices).
+        *
+        * If it doesn't get removed, the linker will complain
+        * loudly with a reasonably clear error message..
+        */
+       if (idx >= __end_of_fixed_addresses)
+               __this_fixmap_does_not_exist();
+
+        return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+       return __virt_to_fix(vaddr);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif
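A typical use of this interface, as a sketch (the index and physical address
are illustrative only):

    /* Map a device page uncached at a compile-time-constant virtual address. */
    set_fixmap_nocache(FIX_CYCLONE_TIMER, 0xfe000000);
    volatile u32 *reg = (volatile u32 *)fix_to_virt(FIX_CYCLONE_TIMER);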
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/page.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/page.h      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,148 @@
+#ifndef _I386_PAGE_H
+#define _I386_PAGE_H
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT     12
+#define PAGE_SIZE      (1UL << PAGE_SHIFT)
+#define PAGE_MASK      (~(PAGE_SIZE-1))
+
+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#include <linux/config.h>
+
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+#define clear_page(page)       mmx_clear_page((void *)(page))
+#define copy_page(to,from)     mmx_copy_page(to,from)
+
+#else
+
+/*
+ *     On older X86 processors it's not a win to use MMX here it seems.
+ *     Maybe the K6-III ?
+ */
+ 
+#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
+#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
+
+#endif
+
+#define clear_user_page(page, vaddr, pg)       clear_page(page)
+#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
+
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+/*
+ * These are used to make use of C type-checking..
+ */
+extern int nx_enabled;
+#ifdef CONFIG_X86_PAE
+extern unsigned long long __supported_pte_mask;
+typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long long pmd; } pmd_t;
+typedef struct { unsigned long long pgd; } pgd_t;
+typedef struct { unsigned long long pgprot; } pgprot_t;
+#define pmd_val(x)     ((x).pmd)
+#define pte_val(x)     ((x).pte_low | ((unsigned long long)(x).pte_high << 32))
+#define __pmd(x) ((pmd_t) { (x) } )
+#define HPAGE_SHIFT    21
+#else
+typedef struct { unsigned long pte_low; } pte_t;
+typedef struct { unsigned long pgd; } pgd_t;
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define boot_pte_t pte_t /* or would you rather have a typedef */
+#define pte_val(x)     ((x).pte_low)
+#define HPAGE_SHIFT    22
+#endif
+#define PTE_MASK       PAGE_MASK
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
+#define pgd_val(x)     ((x).pgd)
+#define pgprot_val(x)  ((x).pgprot)
+
+#define __pte(x) ((pte_t) { (x) } )
+#define __pgd(x) ((pgd_t) { (x) } )
+#define __pgprot(x)    ((pgprot_t) { (x) } )
+
+#endif /* !__ASSEMBLY__ */
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+
+/*
+ * This handles the memory map.. We could make this a config
+ * option, but too many people screw it up, and too few need
+ * it.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB. 
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+
+extern int sysctl_legacy_va_layout;
+
+extern int page_is_ram(unsigned long pagenr);
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __ASSEMBLY__
+#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
+#define __PHYSICAL_START       CONFIG_PHYSICAL_START
+#else
+#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
+#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
+#endif
+#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
+
+
+#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
+#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
+#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
+#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
+#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
+#ifdef CONFIG_FLATMEM
+#define pfn_to_page(pfn)       (mem_map + (pfn))
+#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
+#define pfn_valid(pfn)         ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+
+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#define VM_DATA_DEFAULT_FLAGS \
+       (VM_READ | VM_WRITE | \
+       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+
+#include <asm-generic/page.h>
+
+#endif /* _I386_PAGE_H */
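The linear-mapping macros are a constant offset and therefore round-trip only
for directly mapped (lowmem) addresses; a sketch:

    /* Valid for lowmem only -- vmalloc/highmem addresses must not use __pa(). */
    static struct page *lowmem_page(void *kaddr)
    {
            unsigned long phys = __pa(kaddr);   /* kaddr - PAGE_OFFSET */
            BUG_ON(__va(phys) != kaddr);        /* constant-offset round trip */
            return virt_to_page(kaddr);         /* mem_map + (phys >> PAGE_SHIFT) */
    }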
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,45 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c      2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/traps.c 2006-02-27 15:55:23.000000000 +0000
+@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
+ 
+ static void io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+-      unsigned long i;
+-
+       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+ 
+       /* Re-enable the IOCK line, wait for a few seconds */
+-      reason = (reason & 0xf) | 8;
+-      outb(reason, 0x61);
+-      i = 2000;
+-      while (--i) udelay(1000);
+-      reason &= ~8;
+-      outb(reason, 0x61);
++      clear_io_check_error(reason);
+ }
+ 
+ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h    2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/mach-default/mach_traps.h       2006-02-27 15:55:23.000000000 +0000
+@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
+       outb(reason, 0x61);
+ }
+ 
++static inline void clear_io_check_error(unsigned char reason)
++{
++      unsigned long i;
++
++      reason = (reason & 0xf) | 8;
++      outb(reason, 0x61);
++      i = 2000;
++      while (--i) udelay(1000);
++      reason &= ~8;
++      outb(reason, 0x61);
++}
++
+ static inline unsigned char get_nmi_reason(void)
+ {
+       return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/net-csum.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/net-csum.patch   Wed Mar  1 19:47:25 2006
@@ -0,0 +1,41 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-27 15:55:25.000000000 +0000
+@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
+       if (hdrsize < sizeof(*hdr))
+               return 1;
+ 
+-      hdr->check = ip_nat_cheat_check(~oldip, newip,
++      if ((*pskb)->proto_csum_blank) {
++              hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++      } else {
++              hdr->check = ip_nat_cheat_check(~oldip, newip,
+                                       ip_nat_cheat_check(oldport ^ 0xFFFF,
+                                                          newport,
+                                                          hdr->check));
++      }
+       return 1;
+ }
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-27 15:55:25.000000000 +0000
+@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
+               newport = tuple->dst.u.udp.port;
+               portptr = &hdr->dest;
+       }
+-      if (hdr->check) /* 0 is a special case meaning no checksum */
+-              hdr->check = ip_nat_cheat_check(~oldip, newip,
++      if (hdr->check) { /* 0 is a special case meaning no checksum */
++              if ((*pskb)->proto_csum_blank) {
++                      hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++              } else {
++                      hdr->check = ip_nat_cheat_check(~oldip, newip,
+                                       ip_nat_cheat_check(*portptr ^ 0xFFFF,
+                                                          newport,
+                                                          hdr->check));
++              }
++      }
+       *portptr = newport;
+       return 1;
+ }
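The patch relies on the fact that a ones'-complement checksum can be updated
incrementally: when the checksum is still partial (proto_csum_blank, i.e. only
the pseudo-header has been summed so far because the rest is offloaded), the
address change must be folded in with the opposite sign, hence
ip_nat_cheat_check(oldip, ~newip, ...) rather than (~oldip, newip, ...). A
standalone model of the 16-bit update rule, HC' = ~(~HC + ~m + m') from
RFC 1624 (illustrative, not the kernel helper):

    #include <stdint.h>

    static uint16_t csum16_update(uint16_t check, uint16_t oldw, uint16_t neww)
    {
            uint32_t sum = (uint16_t)~check + (uint16_t)~oldw + neww;
            sum = (sum & 0xffff) + (sum >> 16);  /* fold the carry */
            sum = (sum & 0xffff) + (sum >> 16);  /* a second fold suffices */
            return (uint16_t)~sum;
    }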
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/pmd-shared.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/pmd-shared.patch Wed Mar  1 19:47:25 2006
@@ -0,0 +1,111 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c       2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/mm/pageattr.c  2006-02-27 15:55:31.000000000 +0000
+@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
+       unsigned long flags;
+ 
+       set_pte_atomic(kpte, pte);      /* change init_mm */
+-      if (PTRS_PER_PMD > 1)
++      if (HAVE_SHARED_KERNEL_PMD)
+               return;
+ 
+       spin_lock_irqsave(&pgd_lock, flags);
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c        2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/mm/pgtable.c   2006-02-27 15:55:31.000000000 +0000
+@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
+               spin_lock_irqsave(&pgd_lock, flags);
+       }
+ 
+-      clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+-                      swapper_pg_dir + USER_PTRS_PER_PGD,
+-                      KERNEL_PGD_PTRS);
++      if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
++              clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
++                              swapper_pg_dir + USER_PTRS_PER_PGD,
++                              KERNEL_PGD_PTRS);
+       if (PTRS_PER_PMD > 1)
+               return;
+ 
+@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+                       goto out_oom;
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
++
++      if (!HAVE_SHARED_KERNEL_PMD) {
++              unsigned long flags;
++
++              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                      pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
++                      if (!pmd)
++                              goto out_oom;
++                      set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
++              }
++
++              spin_lock_irqsave(&pgd_lock, flags);
++              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                      unsigned long v = (unsigned long)i << PGDIR_SHIFT;
++                      pgd_t *kpgd = pgd_offset_k(v);
++                      pud_t *kpud = pud_offset(kpgd, v);
++                      pmd_t *kpmd = pmd_offset(kpud, v);
++                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                      memcpy(pmd, kpmd, PAGE_SIZE);
++              }
++              pgd_list_add(pgd);
++              spin_unlock_irqrestore(&pgd_lock, flags);
++      }
++
+       return pgd;
+ 
+ out_oom:
+@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
+       int i;
+ 
+       /* in the PAE case user pgd entries are overwritten before usage */
+-      if (PTRS_PER_PMD > 1)
+-              for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+-                      kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
++      if (PTRS_PER_PMD > 1) {
++              for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
++                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                      kmem_cache_free(pmd_cache, pmd);
++              }
++              if (!HAVE_SHARED_KERNEL_PMD) {
++                      unsigned long flags;
++                      spin_lock_irqsave(&pgd_lock, flags);
++                      pgd_list_del(pgd);
++                      spin_unlock_irqrestore(&pgd_lock, flags);
++                      for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                              pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                              memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
++                              kmem_cache_free(pmd_cache, pmd);
++                      }
++              }
++      }
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       kmem_cache_free(pgd_cache, pgd);
+ }
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h        2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-2level-defs.h   2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
+ #define _I386_PGTABLE_2LEVEL_DEFS_H
+ 
++#define HAVE_SHARED_KERNEL_PMD 0
++
+ /*
+  * traditional i386 two-level paging structure:
+  */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h        2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-3level-defs.h   2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
+ #define _I386_PGTABLE_3LEVEL_DEFS_H
+ 
++#define HAVE_SHARED_KERNEL_PMD 1
++
+ /*
+  * PGDIR_SHIFT determines what a top-level page table entry can map
+  */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/smp-alts.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/smp-alts.patch   Wed Mar  1 19:47:25 2006
@@ -0,0 +1,591 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig ./arch/i386/Kconfig
+--- ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig     2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/Kconfig        2006-02-27 15:55:34.000000000 +0000
+@@ -202,6 +202,19 @@ config SMP
+ 
+         If you don't know what to do here, say N.
+ 
++config SMP_ALTERNATIVES
++      bool "SMP alternatives support (EXPERIMENTAL)"
++      depends on SMP && EXPERIMENTAL
++      help
++        Try to reduce the overhead of running an SMP kernel on a uniprocessor
++        host slightly by replacing certain key instruction sequences
++        according to whether we currently have more than one CPU available.
++        This should provide a noticeable boost to performance when
++        running SMP kernels on UP machines, and have negligible impact
++        when running on a true SMP host.
++
++          If unsure, say N.
++        
+ config NR_CPUS
+       int "Maximum number of CPUs (2-255)"
+       range 2 255
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile     2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/Makefile        2006-02-27 15:55:34.000000000 +0000
+@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI)            += efi.o efi_stub.o
+ obj-$(CONFIG_DOUBLEFAULT)     += doublefault.o
+ obj-$(CONFIG_VM86)            += vm86.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
++obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
+ 
+ EXTRA_AFLAGS   := -traditional
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c    1970-01-01 01:00:00.000000000 +0100
++++ ./arch/i386/kernel/smpalts.c       2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,85 @@
++#include <linux/kernel.h>
++#include <asm/system.h>
++#include <asm/smp_alt.h>
++#include <asm/processor.h>
++#include <asm/string.h>
++
++struct smp_replacement_record {
++      unsigned char targ_size;
++      unsigned char smp1_size;
++      unsigned char smp2_size;
++      unsigned char up_size;
++      unsigned char feature;
++      unsigned char data[0];
++};
++
++struct smp_alternative_record {
++      void *targ_start;
++      struct smp_replacement_record *repl;
++};
++
++extern struct smp_alternative_record __start_smp_alternatives_table,
++  __stop_smp_alternatives_table;
++extern unsigned long __init_begin, __init_end;
++
++void prepare_for_smp(void)
++{
++      struct smp_alternative_record *r;
++      printk(KERN_INFO "Enabling SMP...\n");
++      for (r = &__start_smp_alternatives_table;
++           r != &__stop_smp_alternatives_table;
++           r++) {
++              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++              BUG_ON(r->repl->targ_size < r->repl->up_size);
++               if (system_state == SYSTEM_RUNNING &&
++                   r->targ_start >= (void *)&__init_begin &&
++                   r->targ_start < (void *)&__init_end)
++                       continue;
++              if (r->repl->feature != (unsigned char)-1 &&
++                  boot_cpu_has(r->repl->feature)) {
++                      memcpy(r->targ_start,
++                             r->repl->data + r->repl->smp1_size,
++                             r->repl->smp2_size);
++                      memset(r->targ_start + r->repl->smp2_size,
++                             0x90,
++                             r->repl->targ_size - r->repl->smp2_size);
++              } else {
++                      memcpy(r->targ_start,
++                             r->repl->data,
++                             r->repl->smp1_size);
++                      memset(r->targ_start + r->repl->smp1_size,
++                             0x90,
++                             r->repl->targ_size - r->repl->smp1_size);
++              }
++      }
++      /* Paranoia */
++      asm volatile ("jmp 1f\n1:");
++      mb();
++}
++
++void unprepare_for_smp(void)
++{
++      struct smp_alternative_record *r;
++      printk(KERN_INFO "Disabling SMP...\n");
++      for (r = &__start_smp_alternatives_table;
++           r != &__stop_smp_alternatives_table;
++           r++) {
++              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++              BUG_ON(r->repl->targ_size < r->repl->up_size);
++               if (system_state == SYSTEM_RUNNING &&
++                   r->targ_start >= (void *)&__init_begin &&
++                   r->targ_start < (void *)&__init_end)
++                       continue;
++              memcpy(r->targ_start,
++                     r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
++                     r->repl->up_size);
++              memset(r->targ_start + r->repl->up_size,
++                     0x90,
++                     r->repl->targ_size - r->repl->up_size);
++      }
++      /* Paranoia */
++      asm volatile ("jmp 1f\n1:");
++      mb();
++}
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c    2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/smpboot.c       2006-02-27 15:55:34.000000000 +0000
+@@ -1208,6 +1208,11 @@ static void __init smp_boot_cpus(unsigne
+               if (max_cpus <= cpucount+1)
+                       continue;
+ 
++#ifdef CONFIG_SMP_ALTERNATIVES
++              if (kicked == 1)
++                      prepare_for_smp();
++#endif
++
+               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
+                       printk("CPU #%d not responding - cannot use it.\n",
+                                                               apicid);
+@@ -1386,6 +1391,11 @@ int __devinit __cpu_up(unsigned int cpu)
+               return -EIO;
+       }
+ 
++#ifdef CONFIG_SMP_ALTERNATIVES
++      if (num_online_cpus() == 1)
++              prepare_for_smp();
++#endif
++
+       local_irq_enable();
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+       /* Unleash the CPU! */
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S        2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/kernel/vmlinux.lds.S   2006-02-27 15:55:34.000000000 +0000
+@@ -34,6 +34,13 @@ SECTIONS
+   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
+   __stop___ex_table = .;
+ 
++  . = ALIGN(16);
++  __start_smp_alternatives_table = .;
++  __smp_alternatives : { *(__smp_alternatives) }
++  __stop_smp_alternatives_table = .;
++
++  __smp_replacements : { *(__smp_replacements) }
++
+   RODATA
+ 
+   /* writeable */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/atomic.h        2006-02-27 15:55:34.000000000 +0000
+@@ -4,18 +4,13 @@
+ #include <linux/config.h>
+ #include <linux/compiler.h>
+ #include <asm/processor.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * Atomic operations that C can't guarantee us.  Useful for
+  * resource counting etc..
+  */
+ 
+-#ifdef CONFIG_SMP
+-#define LOCK "lock ; "
+-#else
+-#define LOCK ""
+-#endif
+-
+ /*
+  * Make sure gcc doesn't try to be clever and move things around
+  * on us. We need to use _exactly_ the address the user gave us,
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/bitops.h        2006-02-27 15:55:34.000000000 +0000
+@@ -7,6 +7,7 @@
+ 
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * These have to be done with inline assembly: that way the bit-setting
+@@ -16,12 +17,6 @@
+  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+  */
+ 
+-#ifdef CONFIG_SMP
+-#define LOCK_PREFIX "lock ; "
+-#else
+-#define LOCK_PREFIX ""
+-#endif
+-
+ #define ADDR (*(volatile long *) addr)
+ 
+ /**
+@@ -41,7 +36,7 @@
+  */
+ static inline void set_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btsl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
+  */
+ static inline void clear_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btrl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
+  */
+ static inline void change_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btcl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btsl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btrl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btcl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h ./include/asm-i386/futex.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h      2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/futex.h 2006-02-27 15:55:34.000000000 +0000
+@@ -28,7 +28,7 @@
+ "1:   movl    %2, %0\n\
+       movl    %0, %3\n"                                       \
+       insn "\n"                                               \
+-"2:   " LOCK_PREFIX "cmpxchgl %3, %2\n\
++"2:   " LOCK "cmpxchgl %3, %2\n\
+       jnz     1b\n\
+ 3:    .section .fixup,\"ax\"\n\
+ 4:    mov     %5, %1\n\
+@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
+ #endif
+               switch (op) {
+               case FUTEX_OP_ADD:
+-                      __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
++                      __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
+                                          oldval, uaddr, oparg);
+                       break;
+               case FUTEX_OP_OR:
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h      2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/rwsem.h 2006-02-27 15:55:34.000000000 +0000
+@@ -40,6 +40,7 @@
+ 
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
++#include <asm/smp_alt.h>
+ 
+ struct rwsem_waiter;
+ 
+@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
+ {
+       __asm__ __volatile__(
+               "# beginning down_read\n\t"
+-LOCK_PREFIX   "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
++LOCK          "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
+               "  js        2f\n\t" /* jump if we weren't granted the lock */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
+               "  movl      %1,%2\n\t"
+               "  addl      %3,%2\n\t"
+               "  jle       2f\n\t"
+-LOCK_PREFIX   "  cmpxchgl  %2,%0\n\t"
++LOCK          "  cmpxchgl  %2,%0\n\t"
+               "  jnz       1b\n\t"
+               "2:\n\t"
+               "# ending __down_read_trylock\n\t"
+@@ -150,7 +151,7 @@ static inline void __down_write(struct r
+       tmp = RWSEM_ACTIVE_WRITE_BIAS;
+       __asm__ __volatile__(
+               "# beginning down_write\n\t"
+-LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
++LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
+               "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
+               "  jnz       2f\n\t" /* jump if we weren't granted the lock */
+               "1:\n\t"
+@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
+       __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+       __asm__ __volatile__(
+               "# beginning __up_read\n\t"
+-LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
++LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
+               "  js        2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
+       __asm__ __volatile__(
+               "# beginning __up_write\n\t"
+               "  movl      %2,%%edx\n\t"
+-LOCK_PREFIX   "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
++LOCK          "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
+               "  jnz       2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
+ {
+       __asm__ __volatile__(
+               "# beginning __downgrade_write\n\t"
+-LOCK_PREFIX   "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
++LOCK          "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
+               "  js        2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -263,7 +264,7 @@ LOCK_PREFIX        "  addl      %2,(%%eax)\n\t"
+ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+ {
+       __asm__ __volatile__(
+-LOCK_PREFIX   "addl %1,%0"
++LOCK            "addl %1,%0"
+               : "=m"(sem->count)
+               : "ir"(delta), "m"(sem->count));
+ }
+@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
+       int tmp = delta;
+ 
+       __asm__ __volatile__(
+-LOCK_PREFIX   "xadd %0,(%2)"
++LOCK                    "xadd %0,(%2)"
+               : "+r"(tmp), "=m"(sem->count)
+               : "r"(sem), "m"(sem->count)
+               : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h    1970-01-01 01:00:00.000000000 +0100
++++ ./include/asm-i386/smp_alt.h       2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,32 @@
++#ifndef __ASM_SMP_ALT_H__
++#define __ASM_SMP_ALT_H__
++
++#include <linux/config.h>
++
++#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define LOCK \
++        "6677: nop\n" \
++      ".section __smp_alternatives,\"a\"\n" \
++      ".long 6677b\n" \
++      ".long 6678f\n" \
++      ".previous\n" \
++      ".section __smp_replacements,\"a\"\n" \
++      "6678: .byte 1\n" \
++      ".byte 1\n" \
++      ".byte 0\n" \
++        ".byte 1\n" \
++      ".byte -1\n" \
++      "lock\n" \
++      "nop\n" \
++      ".previous\n"
++void prepare_for_smp(void);
++void unprepare_for_smp(void);
++#else
++#define LOCK "lock ; "
++#endif
++#else
++#define LOCK ""
++#endif
++
++#endif /* __ASM_SMP_ALT_H__ */
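Decoded, the record the LOCK macro emits matches struct smp_replacement_record
in smpalts.c field for field; an annotated reading (a sketch, not compiled
code):

    /* targ_size = 1   one patchable byte at label 6677 (the nop)       */
    /* smp1_size = 1   SMP replacement: the one-byte 'lock' prefix      */
    /* smp2_size = 0   no feature-conditional alternative               */
    /* up_size   = 1   UP replacement: put the 'nop' back               */
    /* feature   = -1  apply unconditionally                            */
    /* data[]    =     the 'lock' byte followed by the 'nop' byte, so   */
    /* prepare_for_smp()/unprepare_for_smp() toggle the prefix live.    */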
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h   2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/spinlock.h      2006-02-27 15:55:34.000000000 +0000
+@@ -6,6 +6,7 @@
+ #include <asm/page.h>
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * Your basic SMP spinlocks, allowing only a single CPU anywhere
+@@ -23,7 +24,8 @@
+ 
+ #define __raw_spin_lock_string \
+       "\n1:\t" \
+-      "lock ; decb %0\n\t" \
++      LOCK \
++      "decb %0\n\t" \
+       "jns 3f\n" \
+       "2:\t" \
+       "rep;nop\n\t" \
+@@ -34,7 +36,8 @@
+ 
+ #define __raw_spin_lock_string_flags \
+       "\n1:\t" \
+-      "lock ; decb %0\n\t" \
++      LOCK \
++      "decb %0\n\t" \
+       "jns 4f\n\t" \
+       "2:\t" \
+       "testl $0x200, %1\n\t" \
+@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
+ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+ {
+       char oldval;
++#ifdef CONFIG_SMP_ALTERNATIVES
+       __asm__ __volatile__(
+-              "xchgb %b0,%1"
++              "1:movb %1,%b0\n"
++              "movb $0,%1\n"
++              "2:"
++              ".section __smp_alternatives,\"a\"\n"
++              ".long 1b\n"
++              ".long 3f\n"
++              ".previous\n"
++              ".section __smp_replacements,\"a\"\n"
++              "3: .byte 2b - 1b\n"
++              ".byte 5f-4f\n"
++              ".byte 0\n"
++              ".byte 6f-5f\n"
++              ".byte -1\n"
++              "4: xchgb %b0,%1\n"
++              "5: movb %1,%b0\n"
++              "movb $0,%1\n"
++              "6:\n"
++              ".previous\n"
+               :"=q" (oldval), "=m" (lock->slock)
+               :"0" (0) : "memory");
++#else
++      __asm__ __volatile__(
++              "xchgb %b0,%1\n"
++              :"=q" (oldval), "=m" (lock->slock)
++              :"0" (0) : "memory");
++#endif
+       return oldval > 0;
+ }
+ 
+@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
+ 
+ static inline void __raw_read_unlock(raw_rwlock_t *rw)
+ {
+-      asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
++      asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
+ }
+ 
+ static inline void __raw_write_unlock(raw_rwlock_t *rw)
+ {
+-      asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
++      asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
+                                : "=m" (rw->lock) : : "memory");
+ }
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h ./include/asm-i386/system.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/system.h        2006-02-27 15:55:34.000000000 +0000
+@@ -5,7 +5,7 @@
+ #include <linux/kernel.h>
+ #include <asm/segment.h>
+ #include <asm/cpufeature.h>
+-#include <linux/bitops.h> /* for LOCK_PREFIX */
++#include <asm/smp_alt.h>
+ 
+ #ifdef __KERNEL__
+ 
+@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
+       unsigned long prev;
+       switch (size) {
+       case 1:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+               return prev;
+       case 2:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
+                                    : "=a"(prev)
+                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+               return prev;
+       case 4:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
+                                    : "=a"(prev)
+                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
+                                     unsigned long long new)
+ {
+       unsigned long long prev;
+-      __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
++      __asm__ __volatile__(LOCK "cmpxchg8b %3"
+                            : "=A"(prev)
+                            : "b"((unsigned long)new),
+                              "c"((unsigned long)(new >> 32)),
+@@ -503,11 +503,55 @@ struct alt_instr { 
+ #endif
+ 
+ #ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define smp_alt_mb(instr)                                           \
++__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
++                   ".section __smp_alternatives,\"a\"\n"          \
++                   ".long 6667b\n"                                \
++                     ".long 6673f\n"                                \
++                   ".previous\n"                                  \
++                   ".section __smp_replacements,\"a\"\n"          \
++                   "6673:.byte 6668b-6667b\n"                     \
++                   ".byte 6670f-6669f\n"                          \
++                   ".byte 6671f-6670f\n"                          \
++                     ".byte 0\n"                                    \
++                   ".byte %c0\n"                                  \
++                   "6669:lock;addl $0,0(%%esp)\n"                 \
++                   "6670:" instr "\n"                             \
++                   "6671:\n"                                      \
++                   ".previous\n"                                  \
++                   :                                              \
++                   : "i" (X86_FEATURE_XMM2)                       \
++                   : "memory")
++#define smp_rmb() smp_alt_mb("lfence")
++#define smp_mb()  smp_alt_mb("mfence")
++#define set_mb(var, value) do {                                     \
++unsigned long __set_mb_temp;                                        \
++__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
++                   ".section __smp_alternatives,\"a\"\n"          \
++                   ".long 6667b\n"                                \
++                   ".long 6673f\n"                                \
++                   ".previous\n"                                  \
++                   ".section __smp_replacements,\"a\"\n"          \
++                   "6673: .byte 6668b-6667b\n"                    \
++                   ".byte 6670f-6669f\n"                          \
++                   ".byte 0\n"                                    \
++                   ".byte 6671f-6670f\n"                          \
++                   ".byte -1\n"                                   \
++                   "6669: xchg %1, %0\n"                          \
++                   "6670:movl %1, %0\n"                           \
++                   "6671:\n"                                      \
++                   ".previous\n"                                  \
++                   : "=m" (var), "=r" (__set_mb_temp)             \
++                   : "1" (value)                                  \
++                   : "memory"); } while (0)
++#else
+ #define smp_mb()      mb()
+ #define smp_rmb()     rmb()
++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
++#endif
+ #define smp_wmb()     wmb()
+ #define smp_read_barrier_depends()    read_barrier_depends()
+-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+ #else
+ #define smp_mb()      barrier()
+ #define smp_rmb()     barrier()
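For reference, the five .byte values that the LOCK macro (and the smp_alt_mb/set_mb macros above) emit into the __smp_replacements section correspond to the replacement record walked by prepare_for_smp()/unprepare_for_smp(); the structure itself appears in the old smp-alts.patch removed further down. A C view of that layout, with field comments inferred from how those routines use it:

    struct smp_replacement_record {
            unsigned char targ_size;  /* bytes available at the patch site */
            unsigned char smp1_size;  /* length of the default SMP sequence */
            unsigned char smp2_size;  /* length of the feature-gated SMP sequence */
            unsigned char up_size;    /* length of the uniprocessor sequence */
            unsigned char feature;    /* CPU feature to test, or -1 for none */
            unsigned char data[0];    /* the three byte sequences, in that order */
    };

For LOCK the record is targ_size 1, smp1_size 1 (the lock prefix byte), smp2_size 0, up_size 1 (a nop) and feature -1.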
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/locking.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/locking.sh Wed Mar  1 19:47:25 2006
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+#
+# Serialisation
+#
+
+LOCK_SLEEPTIME=1
+LOCK_SPINNING_RETRIES=5
+LOCK_RETRIES=10
+LOCK_BASEDIR=/var/run/xen-hotplug
+
+
+claim_lock()
+{
+  local lockdir="$LOCK_BASEDIR/$1"
+  mkdir -p "$LOCK_BASEDIR"
+  _claim_lock "$lockdir"
+}
+
+
+release_lock()
+{
+  _release_lock "$LOCK_BASEDIR/$1"
+}
+
+
+_claim_lock()
+{
+  local lockdir="$1"
+  local owner=$(_lock_owner "$lockdir")
+  local retries=0
+
+  while [ $retries -lt $LOCK_RETRIES ]
+  do
+    mkdir "$lockdir" 2>/dev/null && trap "_release_lock $lockdir; sigerr" ERR &&
+      _update_lock_info "$lockdir" && return
+
+    local new_owner=$(_lock_owner "$lockdir")
+    if [ "$new_owner" != "$owner" ]
+    then
+      owner="$new_owner"
+      retries=0
+    fi
+
+    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
+    then
+      sleep $LOCK_SLEEPTIME
+    else
+      sleep 0
+    fi
+    retries=$(($retries + 1))
+  done
+  _steal_lock "$lockdir"
+}
+
+
+_release_lock()
+{
+  trap sigerr ERR
+  rm -rf "$1" 2>/dev/null || true
+}
+
+
+_steal_lock()
+{
+  local lockdir="$1"
+  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
+  log err "Forced to steal lock on $lockdir from $owner!"
+  _release_lock "$lockdir"
+  _claim_lock "$lockdir"
+}
+
+
+_lock_owner()
+{
+  cat "$1/owner" 2>/dev/null || echo "unknown"
+}
+
+
+_update_lock_info()
+{
+  echo "$$: $0" >"$1/owner"
+}
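The serialisation above rests on mkdir(2) being atomic: exactly one caller creates the lock directory and every other caller sees EEXIST. A minimal C sketch of the same primitive (illustrative only, not part of this changeset; the owner tracking and stealing policy of _claim_lock is simplified away):

    #include <errno.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Take a lock the way claim_lock does, by relying on mkdir() atomicity. */
    static int claim_lock_c(const char *lockdir, int retries)
    {
        while (retries-- > 0) {
            if (mkdir(lockdir, 0700) == 0)
                return 0;           /* we own the lock */
            if (errno != EEXIST)
                return -1;          /* genuine failure */
            sleep(1);               /* held elsewhere: back off and retry */
        }
        return -1;                  /* caller may steal, as _steal_lock does */
    }

Releasing is simply removing the directory again, as _release_lock does with rm -rf.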
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/logging.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/logging.sh Wed Mar  1 19:47:25 2006
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+log() {
+  local level="$1"
+  shift
+  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
+}
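log() funnels everything through logger(1) at the daemon facility and falls back to stderr if that fails. For completeness, the C-level equivalent of this logging path is syslog(3); a sketch (illustrative only; the "xen-hotplug" tag is an arbitrary example):

    #include <syslog.h>

    static void log_err(const char *msg)
    {
        openlog("xen-hotplug", LOG_PID, LOG_DAEMON);
        syslog(LOG_ERR, "%s", msg);   /* same facility/level as "log err" */
        closelog();
    }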
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-delete
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-delete        Wed Mar  1 19:47:25 2006
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# This script must be called as follows:
+# vtpm-delete <domain name>
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
+
+vtpm_delete_instance "$1"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-hotplug-common.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-hotplug-common.sh     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+findCommand "$@"
+if [ "$command" != "online" ]  &&
+   [ "$command" != "offline" ] &&
+   [ "$command" != "add" ]     &&
+   [ "$command" != "remove" ]
+then
+       log err "Invalid command: $command"
+       exit 1
+fi
+
+
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+. "$dir/vtpm-common.sh"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-cleanup
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-cleanup        Wed Mar  1 19:47:25 2006
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Claim the lock protecting /etc/xen/scripts/block.  This stops a race whereby
+# paths in the store would disappear underneath that script as it attempted to
+# read from the store checking for device sharing.
+# Any other scripts that do similar things will have to have their lock
+# claimed too.
+# This is pretty horrible, but there's not really a nicer way of solving this.
+claim_lock "block"
+
+# remove device frontend store entries
+xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
+
+# remove device backend store entries
+xenstore-rm -t "$XENBUS_PATH"       || true
+xenstore-rm -t "error/$XENBUS_PATH" || true
+
+release_lock "block"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/01_vtpm-list_pos.py
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/01_vtpm-list_pos.py      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+#                verify list
+
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+       # Since this is only a temporary domain, clean it up from the
+       # virtual TPM directory
+       traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+    SKIP("vtpm-list not supported for HVM domains")
+
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+    domain.start()
+except DomainError, e:
+    if verbose:
+        print e.extra
+    vtpm_cleanup(domain.getName())
+    FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+status, output = traceCommand("xm vtpm-list %s" % domain.getId())
+eyecatcher = "/local/domain/0/backend/vtpm"
+where = output.find(eyecatcher)
+if status != 0:
+    vtpm_cleanup(domName)
+    FAIL("xm vtpm-list returned bad status, expected 0, status is %i" % status)
+elif where < 0:
+    vtpm_cleanup(domName)
+    FAIL("Fail to list virtual TPM device")
+
+domain.stop()
+
+vtpm_cleanup(domName)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+#                check list of pcrs
+
+import re
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+       # Since this is only a temporary domain, clean it up from the
+       # virtual TPM directory
+       traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+    SKIP("vtpm-list not supported for HVM domains")
+
+status, output = traceCommand("ls /dev/tpm0")
+if re.search("No such file or directory",output):
+    SKIP("This machine has no hardware TPM; cannot run this test")
+
+status, output = traceCommand("ps aux | grep vtpm_manager | grep -v grep")
+if output == "":
+    FAIL("virtual TPM manager must be started to run this test")
+
+# vtpm manager has been detected
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+    domain.start()
+except DomainError, e:
+    if verbose:
+        print e.extra
+    vtpm_cleanup(domain.getName())
+    FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+try:
+    console = XmConsole(domain.getName())
+except ConsoleError, e:
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+try:
+    console.sendInput("input")
+    run = console.runCmd("ls /sys")
+except ConsoleError, e:
+    saveLog(console.getHistory())
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+    try:
+        run = console.runCmd("mkdir /sys")
+        run = console.runCmd("mount -t sysfs /sys /sys")
+    except ConsoleError, e:
+        saveLog(console.getHistory())
+        vtpm_cleanup(domName)
+        FAIL(str(e))
+
+try:
+    run = console.runCmd("cat /sys/devices/platform/tpm_vtpm/pcrs")
+except ConsoleError, e:
+    saveLog(console.getHistory())
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+    FAIL("TPM frontend support not compiled into (domU?) kernel")
+
+console.closeConsole()
+
+domain.stop()
+
+vtpm_cleanup(domName)
+
+if not re.search("PCR-00:",run["output"]):
+       FAIL("Virtual TPM is not working correctly on /dev/vtpm on backend 
side")
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/Makefile.am
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/Makefile.am      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,22 @@
+
+SUBDIRS =
+
+TESTS = 01_vtpm-list_pos.test \
+        02_vtpm-cat_pcrs.test
+
+XFAIL_TESTS =
+
+EXTRA_DIST = $(TESTS) $(XFAIL_TESTS)
+
+TESTS_ENVIRONMENT=@TENV@
+
+%.test: %.py
+       cp $< $@
+       chmod +x $@
+
+clean-local: am_config_clean-local
+
+am_config_clean-local:
+       rm -f *test
+       rm -f *log
+       rm -f *~
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/supervisor_mode_kernel.S
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,145 @@
+/*
+ * Handle stack fixup for guest running in RING 0.
+ *
+ * Copyright (c) 2006 Ian Campbell
+ *
+ * When a guest kernel is allowed to run in RING 0 a hypercall,
+ * interrupt or exception interrupting the guest kernel will not cause
+ * a privilege level change and therefore the stack will not be swapped
+ * to the Xen stack.
+ *
+ * To fix this we look for RING 0 activation frames with a stack
+ * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
+ * frame) and fix this up by locating the Xen stack via the TSS
+ * and moving the activation frame to the Xen stack. In the process we
+ * convert the frame into an inter-privilege frame returning to RING 1
+ * so that we can catch and reverse the process on exit.
+ */
+
+#include <xen/config.h>
+#include <asm/asm_defns.h>
+#include <public/xen.h>
+
+        # Upon entry the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
+        # On exit the stack should be %ss:%esp (i.e. the guest stack)
+        # and contain:
+        #   EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
+        ALIGN
+ENTRY(restore_ring0_guest)
+        # Point %gs:%esi to guest stack.
+RRG0:   movw UREGS_ss+4(%esp),%gs
+        movl UREGS_esp+4(%esp),%esi
+
+        # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
+        movl $(UREGS_kernel_sizeof>>2)+1,%ecx
+
+1:      subl $4,%esi
+        movl -4(%esp,%ecx,4),%eax
+RRG1:   movl %eax,%gs:(%esi)
+        loop 1b
+
+RRG2:   andl $~3,%gs:UREGS_cs+4(%esi)
+
+        movl %gs,%eax
+
+        # We need to do this because these registers are not present
+        # on the guest stack so they cannot be restored by the code in
+        # restore_all_guest.
+RRG3:   mov  UREGS_ds+4(%esp),%ds
+RRG4:   mov  UREGS_es+4(%esp),%es
+RRG5:   mov  UREGS_fs+4(%esp),%fs
+RRG6:   mov  UREGS_gs+4(%esp),%gs
+
+RRG7:   movl %eax,%ss
+        movl %esi,%esp
+
+        ret
+.section __ex_table,"a"
+        .long RRG0,domain_crash_synchronous
+        .long RRG1,domain_crash_synchronous
+        .long RRG2,domain_crash_synchronous
+        .long RRG3,domain_crash_synchronous
+        .long RRG4,domain_crash_synchronous
+        .long RRG5,domain_crash_synchronous
+        .long RRG6,domain_crash_synchronous
+        .long RRG7,domain_crash_synchronous
+.previous
+
+        # Upon entry the stack should be a guest stack and contain:
+        #   EFLAGS, %cs, %eip, ERROR, RETURN
+        # On exit the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
+        ALIGN
+ENTRY(fixup_ring0_guest_stack)
+        pushl %eax
+        pushl %ecx
+        pushl %ds
+        pushl %gs
+        pushl %esi
+
+        movw  $__HYPERVISOR_DS,%ax
+        movw  %ax,%ds
+
+        # Point %gs:%esi to guest stack frame.
+        movw  %ss,%ax
+        movw  %ax,%gs
+        movl  %esp,%esi
+        # Account for entries on the guest stack:
+        # * Pushed by normal exception/interrupt/hypercall mechanisms
+        #   * EFLAGS, %cs, %eip, ERROR == 4 words.
+        # * Pushed by the fixup routine
+        #   * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
+        addl $((6+4)*4),%esi
+
+        # %gs:%esi now points to the guest stack before the
+        # interrupt/exception occurred.
+
+        /*
+         * Reverse the __TSS macro, giving us the CPU number.
+         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
+         */
+        str   %ecx
+        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
+        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
+        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+
+        # Load Xen stack from TSS.
+        movw  TSS_ss0(%ecx),%ax
+TRP1:   movw  %ax,%ss
+        movl  TSS_esp0(%ecx),%esp
+
+        pushl %gs
+        pushl %esi
+
+        # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
+        # from guest stack to Xen stack.
+        movl  $10,%ecx
+1:      subl  $4,%esp
+        subl  $4,%esi
+TRP2:   movl  %gs:(%esi),%eax
+        movl  %eax,(%esp)
+        loop  1b
+
+        # CS = CS|1 to simulate RING1 stack frame.
+        orl   $1,32(%esp)
+
+        popl  %esi
+        popl  %gs
+        popl  %ds
+        popl  %ecx
+        popl  %eax
+        ret
+.section __ex_table,"a"
+        .long TRP1,domain_crash_synchronous
+        .long TRP2,domain_crash_synchronous
+.previous
+
+domain_crash_synchronous_string:
+        .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
+
+domain_crash_synchronous:
+        pushl $domain_crash_synchronous_string
+        call  printf
+        jmp   __domain_crash_synchronous
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/uaccess.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/uaccess.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,285 @@
+#ifndef _ASM_IA64_UACCESS_H
+#define _ASM_IA64_UACCESS_H
+
+/*
+ * This file defines various macros to transfer memory areas across
+ * the user/kernel boundary.  This needs to be done carefully because
+ * this code is executed in kernel mode and uses user-specified
+ * addresses.  Thus, we need to be careful not to let the user to
+ * trick us into accessing kernel memory that would normally be
+ * inaccessible.  This code is also fairly performance sensitive,
+ * so we want to spend as little time doing safety checks as
+ * possible.
+ *
+ * To make matters a bit more interesting, these macros are sometimes also
+ * called from within the kernel itself, in which case the address
+ * validity check must be skipped.  The get_fs() macro tells us what
+ * to do: if get_fs()==USER_DS, checking is performed, if
+ * get_fs()==KERNEL_DS, checking is bypassed.
+ *
+ * Note that even if the memory area specified by the user is in a
+ * valid address range, it is still possible that we'll get a page
+ * fault while accessing it.  This is handled by filling out an
+ * exception handler fixup entry for each instruction that has the
+ * potential to fault.  When such a fault occurs, the page fault
+ * handler checks to see whether the faulting instruction has a fixup
+ * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
+ * then resumes execution at the continuation point.
+ *
+ * Based on <asm-alpha/uaccess.h>.
+ *
+ * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@xxxxxxxxxx>
+ */
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h>
+
+#include <asm/intrinsics.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+
+#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
+#define __access_ok(addr) (!IS_VMM_ADDRESS((unsigned long)(addr)))
+#define access_ok(addr, size) (__access_ok(addr))
+#define array_access_ok(addr,count,size)( __access_ok(addr))
+
+/*
+ * These are the main single-value transfer routines.  They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * Careful to not
+ * (a) re-use the arguments for side effects (sizeof/typeof is ok)
+ * (b) require any knowledge of processes at this stage
+ */
+#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
+#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
+
+/*
+ * The "__xxx" versions do not do address space checking, useful when
+ * doing multiple accesses to the same area (the programmer has to do the
+ * checks by hand with "access_ok()")
+ */
+#define __put_user(x, ptr)     __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
+#define __get_user(x, ptr)     __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+
+extern long __put_user_unaligned_unknown (void);
+
+#define __put_user_unaligned(x, ptr)                                            \
+({                                                                              \
+       long __ret;                                                              \
+       switch (sizeof(*(ptr))) {                                                \
+               case 1: __ret = __put_user((x), (ptr)); break;                   \
+               case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))            \
+                       | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;   \
+               case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))           \
+                       | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
+               case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))           \
+                       | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \
+               default: __ret = __put_user_unaligned_unknown();                 \
+       }                                                                        \
+       __ret;                                                                   \
+})
+
+extern long __get_user_unaligned_unknown (void);
+
+#define __get_user_unaligned(x, ptr)                                            \
+({                                                                              \
+       long __ret;                                                              \
+       switch (sizeof(*(ptr))) {                                                \
+               case 1: __ret = __get_user((x), (ptr)); break;                   \
+               case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))            \
+                       | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;   \
+               case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))           \
+                       | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
+               case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))           \
+                       | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;  \
+               default: __ret = __get_user_unaligned_unknown();                 \
+       }                                                                        \
+       __ret;                                                                   \
+})
+
+#ifdef ASM_SUPPORTED
+  struct __large_struct { unsigned long buf[100]; };
+# define __m(x) (*(struct __large_struct __user *)(x))
+
+/* We need to declare the __ex_table section before we can use it in .xdata. */
+asm (".section \"__ex_table\", \"a\"\n\t.previous");
+
+# define __get_user_size(val, addr, n, err)                                     \
+do {                                                                            \
+       register long __gu_r8 asm ("r8") = 0;                                    \
+       register long __gu_r9 asm ("r9");                                        \
+       asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n" \
+            "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"                          \
+            "[1:]"                                                              \
+            : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));     \
+       (err) = __gu_r8;                                                         \
+       (val) = __gu_r9;                                                         \
+} while (0)
+
+/*
+ * The "__put_user_size()" macro tells gcc it reads from memory instead of 
writing it.  This
+ * is because they do not write to any memory gcc knows about, so there are no 
aliasing
+ * issues.
+ */
+# define __put_user_size(val, addr, n, err)                                     \
+do {                                                                            \
+       register long __pu_r8 asm ("r8") = 0;                                    \
+       asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \
+                     "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"                   \
+                     "[1:]"                                                     \
+                     : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8)); \
+       (err) = __pu_r8;                                                         \
+} while (0)
+
+#else /* !ASM_SUPPORTED */
+# define RELOC_TYPE    2       /* ip-rel */
+# define __get_user_size(val, addr, n, err)                            \
+do {                                                                   \
+       __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);   \
+       (err) = ia64_getreg(_IA64_REG_R8);                              \
+       (val) = ia64_getreg(_IA64_REG_R9);                              \
+} while (0)
+# define __put_user_size(val, addr, n, err)                                     \
+do {                                                                            \
+       __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val)); \
+       (err) = ia64_getreg(_IA64_REG_R8);                                       \
+} while (0)
+#endif /* !ASM_SUPPORTED */
+
+extern void __get_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
+ * using r8/r9.
+ */
+#define __do_get_user(check, x, ptr, size, segment)                             \
+({                                                                              \
+       const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);                       \
+       __typeof__ (size) __gu_size = (size);                                    \
+       long __gu_err = -EFAULT, __gu_val = 0;                                   \
+                                                                                \
+       if (!check || __access_ok(__gu_ptr))                                     \
+               switch (__gu_size) {                                             \
+                     case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break; \
+                     case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break; \
+                     case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break; \
+                     case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break; \
+                     default: __get_user_unknown(); break;                      \
+               }                                                                \
+       (x) = (__typeof__(*(__gu_ptr))) __gu_val;                                \
+       __gu_err;                                                                \
+})
+
+#define __get_user_nocheck(x, ptr, size)       __do_get_user(0, x, ptr, size, KERNEL_DS)
+#define __get_user_check(x, ptr, size, segment)        __do_get_user(1, x, ptr, size, segment)
+
+extern void __put_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
+ */
+#define __do_put_user(check, x, ptr, size, segment)                             \
+({                                                                              \
+       __typeof__ (x) __pu_x = (x);                                             \
+       __typeof__ (*(ptr)) __user *__pu_ptr = (ptr);                            \
+       __typeof__ (size) __pu_size = (size);                                    \
+       long __pu_err = -EFAULT;                                                 \
+                                                                                \
+       if (!check || __access_ok(__pu_ptr))                                     \
+               switch (__pu_size) {                                             \
+                     case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break; \
+                     case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break; \
+                     case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break; \
+                     case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break; \
+                     default: __put_user_unknown(); break;                      \
+               }                                                                \
+       __pu_err;                                                                \
+})
+
+#define __put_user_nocheck(x, ptr, size)       __do_put_user(0, x, ptr, size, KERNEL_DS)
+#define __put_user_check(x, ptr, size, segment)        __do_put_user(1, x, ptr, size, segment)
+
+/*
+ * Complex access routines
+ */
+extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
+                                              unsigned long count);
+
+static inline unsigned long
+__copy_to_user (void __user *to, const void *from, unsigned long count)
+{
+       return __copy_user(to, (void __user *) from, count);
+}
+
+static inline unsigned long
+__copy_from_user (void *to, const void __user *from, unsigned long count)
+{
+       return __copy_user((void __user *) to, from, count);
+}
+
+#define __copy_to_user_inatomic                __copy_to_user
+#define __copy_from_user_inatomic      __copy_from_user
+#define copy_to_user(to, from, n)                                               \
+({                                                                              \
+       void __user *__cu_to = (to);                                             \
+       const void *__cu_from = (from);                                          \
+       long __cu_len = (n);                                                     \
+                                                                                \
+       if (__access_ok(__cu_to))                                                \
+               __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len); \
+       __cu_len;                                                                \
+})
+
+#define copy_from_user(to, from, n)                                             \
+({                                                                              \
+       void *__cu_to = (to);                                                    \
+       const void __user *__cu_from = (from);                                   \
+       long __cu_len = (n);                                                     \
+                                                                                \
+       __chk_user_ptr(__cu_from);                                               \
+       if (__access_ok(__cu_from))                                              \
+               __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len); \
+       __cu_len;                                                                \
+})
+
+#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
+
+static inline unsigned long
+copy_in_user (void __user *to, const void __user *from, unsigned long n)
+{
+       if (likely(access_ok(from, n) && access_ok(to, n)))
+               n = __copy_user(to, from, n);
+       return n;
+}
+
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
+
+struct exception_table_entry {
+       int addr;       /* location-relative address of insn this fixup is for */
+       int cont;       /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
+};
+
+extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
+extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
+
+static inline int
+ia64_done_with_exception (struct pt_regs *regs)
+{
+       const struct exception_table_entry *e;
+       e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
+       if (e) {
+               ia64_handle_exception(regs, e);
+               return 1;
+       }
+       return 0;
+}
+
+#endif /* _ASM_IA64_UACCESS_H */
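From a caller's point of view the exception-table machinery above is invisible: get_user() and put_user() simply evaluate to 0 on success or -EFAULT if the fixup fired. A sketch of typical use (read_guest_word is a hypothetical helper, not part of this patch):

    static long read_guest_word(unsigned long __user *uptr, unsigned long *out)
    {
        unsigned long val;

        if (get_user(val, uptr))
            return -EFAULT;   /* fault path: r8 was set to -EFAULT, r9 to 0 */
        *out = val;
        return 0;
    }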
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/features.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/features.h     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * features.h
+ * 
+ * Feature flags, reported by XENVER_get_features.
+ * 
+ * Copyright (c) 2006, Keir Fraser <keir@xxxxxxxxxxxxx>
+ */
+
+#ifndef __XEN_PUBLIC_FEATURES_H__
+#define __XEN_PUBLIC_FEATURES_H__
+
+/*
+ * If set, the guest does not need to write-protect its pagetables, and can
+ * update them via direct writes.
+ */
+#define XENFEAT_writable_page_tables       0
+
+/*
+ * If set, the guest does not need to write-protect its segment descriptor
+ * tables, and can update them via direct writes.
+ */
+#define XENFEAT_writable_descriptor_tables 1
+
+/*
+ * If set, translation between the guest's 'pseudo-physical' address space
+ * and the host's machine address space are handled by the hypervisor. In this
+ * mode the guest does not need to perform phys-to/from-machine translations
+ * when performing page table operations.
+ */
+#define XENFEAT_auto_translated_physmap    2
+
+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
+#define XENFEAT_supervisor_mode_kernel     3
+
+/*
+ * If set, the guest does not need to allocate x86 PAE page directories
+ * below 4GB. This flag is usually implied by auto_translated_physmap.
+ */
+#define XENFEAT_pae_pgdir_above_4gb        4
+
+#define XENFEAT_NR_SUBMAPS 1
+
+#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
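A guest retrieves these flags via the XENVER_get_features version-op named in the header comment. A sketch of a guest-side check, under the assumption that a xen_feature_info structure with submap_idx/submap fields and a HYPERVISOR_xen_version() wrapper exist on the guest side (neither is defined by this patch, so the names here are assumptions):

    static int xen_has_feature(unsigned int nr)
    {
        struct xen_feature_info fi = { .submap_idx = nr / 32 };  /* assumed layout */

        if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
            return 0;
        return (fi.submap >> (nr % 32)) & 1;  /* one 32-bit submap per index */
    }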
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/guest_access.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/guest_access.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,71 @@
+/******************************************************************************
+ * guest_access.h
+ * 
+ * Copyright (c) 2006, K A Fraser
+ */
+
+#ifndef __XEN_GUEST_ACCESS_H__
+#define __XEN_GUEST_ACCESS_H__
+
+#include <asm/uaccess.h>
+
+/* Is the guest handle a NULL reference? */
+#define guest_handle_is_null(hnd)        ((hnd).p == NULL)
+
+/* Offset the given guest handle into the array it refers to. */
+#define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr))
+
+/* Cast a guest handle to the specified type of handle. */
+#define guest_handle_cast(hnd, type) ({         \
+    type *_x = (hnd).p;                         \
+    (GUEST_HANDLE(type)) { _x };                \
+})
+
+/*
+ * Copy an array of objects to guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_to_guest_offset(hnd, off, ptr, nr) ({      \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));       \
+})
+#define copy_to_guest(hnd, ptr, nr)                     \
+    copy_to_guest_offset(hnd, 0, ptr, nr)
+
+/*
+ * Copy an array of objects from guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_from_guest_offset(ptr, hnd, off, nr) ({    \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));     \
+})
+#define copy_from_guest(ptr, hnd, nr)                   \
+    copy_from_guest_offset(ptr, hnd, 0, nr)
+
+/*
+ * Pre-validate a guest handle.
+ * Allows use of faster __copy_* functions.
+ */
+#define guest_handle_okay(hnd, nr)                      \
+    array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))
+
+#define __copy_to_guest_offset(hnd, off, ptr, nr) ({    \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));     \
+})
+#define __copy_to_guest(hnd, ptr, nr)                   \
+    __copy_to_guest_offset(hnd, 0, ptr, nr)
+
+#define __copy_from_guest_offset(ptr, hnd, off, nr) ({  \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));   \
+})
+#define __copy_from_guest(ptr, hnd, nr)                 \
+    __copy_from_guest_offset(ptr, hnd, 0, nr)
+
+#endif /* __XEN_GUEST_ACCESS_H__ */
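Taken together, these macros give hypercall handlers a typed, checked view of guest memory: validate the whole array once with guest_handle_okay(), then use the faster double-underscore copies. A sketch of that pattern (do_example_op and example_t are hypothetical; the GUEST_HANDLE type is assumed to be declared elsewhere, as guest_handle_cast implies):

    typedef struct { int id; long value; } example_t;

    static long do_example_op(GUEST_HANDLE(example_t) uops, unsigned int count)
    {
        example_t op;
        unsigned int i;

        if (!guest_handle_okay(uops, count))      /* validate the array once */
            return -EFAULT;

        for (i = 0; i < count; i++) {
            if (__copy_from_guest(&op, uops, 1))  /* nonzero => bytes left uncopied */
                return -EFAULT;
            /* ... act on op ... */
            guest_handle_add_offset(uops, 1);     /* advance to the next element */
        }
        return 0;
    }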
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch
--- a/patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch     Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,45 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c      2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/traps.c 2006-02-15 20:40:43.000000000 +0000
-@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
- 
- static void io_check_error(unsigned char reason, struct pt_regs * regs)
- {
--      unsigned long i;
--
-       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
-       show_registers(regs);
- 
-       /* Re-enable the IOCK line, wait for a few seconds */
--      reason = (reason & 0xf) | 8;
--      outb(reason, 0x61);
--      i = 2000;
--      while (--i) udelay(1000);
--      reason &= ~8;
--      outb(reason, 0x61);
-+      clear_io_check_error(reason);
- }
- 
- static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h    2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/mach-default/mach_traps.h       2006-02-15 20:40:43.000000000 +0000
-@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
-       outb(reason, 0x61);
- }
- 
-+static inline void clear_io_check_error(unsigned char reason)
-+{
-+      unsigned long i;
-+
-+      reason = (reason & 0xf) | 8;
-+      outb(reason, 0x61);
-+      i = 2000;
-+      while (--i) udelay(1000);
-+      reason &= ~8;
-+      outb(reason, 0x61);
-+}
-+
- static inline unsigned char get_nmi_reason(void)
- {
-       return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/net-csum.patch
--- a/patches/linux-2.6.16-rc4/net-csum.patch   Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,41 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-02 17:44:18.000000000 +0000
-@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
-       if (hdrsize < sizeof(*hdr))
-               return 1;
- 
--      hdr->check = ip_nat_cheat_check(~oldip, newip,
-+      if ((*pskb)->proto_csum_blank) {
-+              hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
-+      } else {
-+              hdr->check = ip_nat_cheat_check(~oldip, newip,
-                                       ip_nat_cheat_check(oldport ^ 0xFFFF,
-                                                          newport,
-                                                          hdr->check));
-+      }
-       return 1;
- }
-
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-02 17:44:18.000000000 +0000
-@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
-               newport = tuple->dst.u.udp.port;
-               portptr = &hdr->dest;
-       }
--      if (hdr->check) /* 0 is a special case meaning no checksum */
--              hdr->check = ip_nat_cheat_check(~oldip, newip,
-+      if (hdr->check) { /* 0 is a special case meaning no checksum */
-+              if ((*pskb)->proto_csum_blank) {
-+                      hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
-+              } else {
-+                      hdr->check = ip_nat_cheat_check(~oldip, newip,
-                                       ip_nat_cheat_check(*portptr ^ 0xFFFF,
-                                                          newport,
-                                                          hdr->check));
-+              }
-+      }
-       *portptr = newport;
-       return 1;
- }
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/pmd-shared.patch
--- a/patches/linux-2.6.16-rc4/pmd-shared.patch Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,111 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c  2006-02-02 17:39:29.000000000 +0000
-+++ ./arch/i386/mm/pageattr.c  2006-02-02 17:45:14.000000000 +0000
-@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
-       unsigned long flags;
- 
-       set_pte_atomic(kpte, pte);      /* change init_mm */
--      if (PTRS_PER_PMD > 1)
-+      if (HAVE_SHARED_KERNEL_PMD)
-               return;
- 
-       spin_lock_irqsave(&pgd_lock, flags);
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c   2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/mm/pgtable.c   2006-02-02 17:45:14.000000000 +0000
-@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
-               spin_lock_irqsave(&pgd_lock, flags);
-       }
- 
--      clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
--                      swapper_pg_dir + USER_PTRS_PER_PGD,
--                      KERNEL_PGD_PTRS);
-+      if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
-+              clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-+                              swapper_pg_dir + USER_PTRS_PER_PGD,
-+                              KERNEL_PGD_PTRS);
-       if (PTRS_PER_PMD > 1)
-               return;
- 
-@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
-                       goto out_oom;
-               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
-       }
-+
-+      if (!HAVE_SHARED_KERNEL_PMD) {
-+              unsigned long flags;
-+
-+              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                      pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-+                      if (!pmd)
-+                              goto out_oom;
-+                      set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
-+              }
-+
-+              spin_lock_irqsave(&pgd_lock, flags);
-+              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                      unsigned long v = (unsigned long)i << PGDIR_SHIFT;
-+                      pgd_t *kpgd = pgd_offset_k(v);
-+                      pud_t *kpud = pud_offset(kpgd, v);
-+                      pmd_t *kpmd = pmd_offset(kpud, v);
-+                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                      memcpy(pmd, kpmd, PAGE_SIZE);
-+              }
-+              pgd_list_add(pgd);
-+              spin_unlock_irqrestore(&pgd_lock, flags);
-+      }
-+
-       return pgd;
- 
- out_oom:
-@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
-       int i;
- 
-       /* in the PAE case user pgd entries are overwritten before usage */
--      if (PTRS_PER_PMD > 1)
--              for (i = 0; i < USER_PTRS_PER_PGD; ++i)
--                      kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-+      if (PTRS_PER_PMD > 1) {
-+              for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-+                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                      kmem_cache_free(pmd_cache, pmd);
-+              }
-+              if (!HAVE_SHARED_KERNEL_PMD) {
-+                      unsigned long flags;
-+                      spin_lock_irqsave(&pgd_lock, flags);
-+                      pgd_list_del(pgd);
-+                      spin_unlock_irqrestore(&pgd_lock, flags);
-+                      for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                              pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                              memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-+                              kmem_cache_free(pmd_cache, pmd);
-+                      }
-+              }
-+      }
-       /* in the non-PAE case, free_pgtables() clears user pgd entries */
-       kmem_cache_free(pgd_cache, pgd);
- }
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-2level-defs.h   2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
- #define _I386_PGTABLE_2LEVEL_DEFS_H
- 
-+#define HAVE_SHARED_KERNEL_PMD 0
-+
- /*
-  * traditional i386 two-level paging structure:
-  */
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-3level-defs.h   2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
- #define _I386_PGTABLE_3LEVEL_DEFS_H
- 
-+#define HAVE_SHARED_KERNEL_PMD 1
-+
- /*
-  * PGDIR_SHIFT determines what a top-level page table entry can map
-  */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/smp-alts.patch
--- a/patches/linux-2.6.16-rc4/smp-alts.patch   Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,591 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig ./arch/i386/Kconfig
---- ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig     2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/Kconfig        2006-02-15 20:45:57.000000000 +0000
-@@ -202,6 +202,19 @@ config SMP
- 
-         If you don't know what to do here, say N.
- 
-+config SMP_ALTERNATIVES
-+      bool "SMP alternatives support (EXPERIMENTAL)"
-+      depends on SMP && EXPERIMENTAL
-+      help
-+        Try to reduce the overhead of running an SMP kernel on a uniprocessor
-+        host slightly by replacing certain key instruction sequences
-+        according to whether we currently have more than one CPU available.
-+        This should provide a noticeable boost to performance when
-+        running SMP kernels on UP machines, and have negligible impact
-+        when running on a true SMP host.
-+
-+          If unsure, say N.
-+        
- config NR_CPUS
-       int "Maximum number of CPUs (2-255)"
-       range 2 255
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile     2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/Makefile        2006-02-15 20:45:57.000000000 +0000
-@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI)            += efi.o efi_stub.o
- obj-$(CONFIG_DOUBLEFAULT)     += doublefault.o
- obj-$(CONFIG_VM86)            += vm86.o
- obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
-+obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
- 
- EXTRA_AFLAGS   := -traditional
- 
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c    1970-01-01 01:00:00.000000000 +0100
-+++ ./arch/i386/kernel/smpalts.c       2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,85 @@
-+#include <linux/kernel.h>
-+#include <asm/system.h>
-+#include <asm/smp_alt.h>
-+#include <asm/processor.h>
-+#include <asm/string.h>
-+
-+struct smp_replacement_record {
-+      unsigned char targ_size;
-+      unsigned char smp1_size;
-+      unsigned char smp2_size;
-+      unsigned char up_size;
-+      unsigned char feature;
-+      unsigned char data[0];
-+};
-+
-+struct smp_alternative_record {
-+      void *targ_start;
-+      struct smp_replacement_record *repl;
-+};
-+
-+extern struct smp_alternative_record __start_smp_alternatives_table,
-+  __stop_smp_alternatives_table;
-+extern unsigned long __init_begin, __init_end;
-+
-+void prepare_for_smp(void)
-+{
-+      struct smp_alternative_record *r;
-+      printk(KERN_INFO "Enabling SMP...\n");
-+      for (r = &__start_smp_alternatives_table;
-+           r != &__stop_smp_alternatives_table;
-+           r++) {
-+              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+              BUG_ON(r->repl->targ_size < r->repl->up_size);
-+               if (system_state == SYSTEM_RUNNING &&
-+                   r->targ_start >= (void *)&__init_begin &&
-+                   r->targ_start < (void *)&__init_end)
-+                       continue;
-+              if (r->repl->feature != (unsigned char)-1 &&
-+                  boot_cpu_has(r->repl->feature)) {
-+                      memcpy(r->targ_start,
-+                             r->repl->data + r->repl->smp1_size,
-+                             r->repl->smp2_size);
-+                      memset(r->targ_start + r->repl->smp2_size,
-+                             0x90,
-+                             r->repl->targ_size - r->repl->smp2_size);
-+              } else {
-+                      memcpy(r->targ_start,
-+                             r->repl->data,
-+                             r->repl->smp1_size);
-+                      memset(r->targ_start + r->repl->smp1_size,
-+                             0x90,
-+                             r->repl->targ_size - r->repl->smp1_size);
-+              }
-+      }
-+      /* Paranoia */
-+      asm volatile ("jmp 1f\n1:");
-+      mb();
-+}
-+
-+void unprepare_for_smp(void)
-+{
-+      struct smp_alternative_record *r;
-+      printk(KERN_INFO "Disabling SMP...\n");
-+      for (r = &__start_smp_alternatives_table;
-+           r != &__stop_smp_alternatives_table;
-+           r++) {
-+              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+              BUG_ON(r->repl->targ_size < r->repl->up_size);
-+               if (system_state == SYSTEM_RUNNING &&
-+                   r->targ_start >= (void *)&__init_begin &&
-+                   r->targ_start < (void *)&__init_end)
-+                       continue;
-+              memcpy(r->targ_start,
-+                     r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
-+                     r->repl->up_size);
-+              memset(r->targ_start + r->repl->up_size,
-+                     0x90,
-+                     r->repl->targ_size - r->repl->up_size);
-+      }
-+      /* Paranoia */
-+      asm volatile ("jmp 1f\n1:");
-+      mb();
-+}
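
(Illustrative sketch, not part of the changeset: the record layout that
prepare_for_smp()/unprepare_for_smp() consume is easier to see in a
self-contained userspace model.  data[] carries the SMP bytes, then the
feature-specific SMP bytes, then the UP bytes, back to back; the byte
values below mirror the single-byte LOCK case defined in smp_alt.h
further down.)

    #include <stdio.h>
    #include <string.h>

    struct smp_replacement_record {
            unsigned char targ_size;   /* bytes available at the patch site */
            unsigned char smp1_size;   /* default SMP byte sequence */
            unsigned char smp2_size;   /* SMP sequence when 'feature' is present */
            unsigned char up_size;     /* uniprocessor byte sequence */
            unsigned char feature;     /* CPU feature gate; 0xff means none */
            unsigned char data[8];     /* smp1 bytes, smp2 bytes, up bytes */
    };

    static void patch_for_smp(unsigned char *targ,
                              const struct smp_replacement_record *r)
    {
            /* copy the SMP variant, pad the rest of the site with 0x90 (nop) */
            memcpy(targ, r->data, r->smp1_size);
            memset(targ + r->smp1_size, 0x90, r->targ_size - r->smp1_size);
    }

    static void patch_for_up(unsigned char *targ,
                             const struct smp_replacement_record *r)
    {
            /* the UP bytes live after the smp1 and smp2 sequences */
            memcpy(targ, r->data + r->smp1_size + r->smp2_size, r->up_size);
            memset(targ + r->up_size, 0x90, r->targ_size - r->up_size);
    }

    int main(void)
    {
            unsigned char site[1] = { 0x90 };      /* the "6677: nop" placeholder */
            const struct smp_replacement_record r = {
                    .targ_size = 1, .smp1_size = 1, .smp2_size = 0, .up_size = 1,
                    .feature = 0xff,
                    .data = { 0xf0, 0x90 },        /* "lock" prefix, then "nop" */
            };

            patch_for_smp(site, &r);
            printf("SMP byte: %#04x\n", site[0]);  /* 0xf0 */
            patch_for_up(site, &r);
            printf("UP byte:  %#04x\n", site[0]);  /* 0x90 */
            return 0;
    }
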
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c    2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/smpboot.c       2006-02-15 20:45:57.000000000 +0000
-@@ -1214,6 +1214,11 @@ static void __init smp_boot_cpus(unsigne
-               if (max_cpus <= cpucount+1)
-                       continue;
- 
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+              if (kicked == 1)
-+                      prepare_for_smp();
-+#endif
-+
-               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
-                       printk("CPU #%d not responding - cannot use it.\n",
-                                                               apicid);
-@@ -1392,6 +1397,11 @@ int __devinit __cpu_up(unsigned int cpu)
-               return -EIO;
-       }
- 
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+      if (num_online_cpus() == 1)
-+              prepare_for_smp();
-+#endif
-+
-       local_irq_enable();
-       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-       /* Unleash the CPU! */
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S        2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/kernel/vmlinux.lds.S   2006-02-15 20:45:57.000000000 +0000
-@@ -34,6 +34,13 @@ SECTIONS
-   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-   __stop___ex_table = .;
- 
-+  . = ALIGN(16);
-+  __start_smp_alternatives_table = .;
-+  __smp_alternatives : { *(__smp_alternatives) }
-+  __stop_smp_alternatives_table = .;
-+
-+  __smp_replacements : { *(__smp_replacements) }
-+
-   RODATA
- 
-   /* writeable */
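
(Illustrative sketch, not part of the changeset: the hunk above collects
the entries emitted by the LOCK macro into one array bracketed by
__start_/__stop_ symbols, which smpalts.c walks at boot.  The same
table-in-a-section pattern can be tried in userspace; on ELF targets GNU
ld synthesizes the bracket symbols for any section whose name is a valid
C identifier.  The section name "mytable" is invented for the demo.)

    #include <stdio.h>

    struct entry { const char *name; };

    /* GNU ld provides these automatically for the "mytable" section */
    extern struct entry __start_mytable[], __stop_mytable[];

    #define TABLE_ENTRY(n) \
            static struct entry n \
            __attribute__((used, section("mytable"))) = { #n }

    TABLE_ENTRY(first);
    TABLE_ENTRY(second);

    int main(void)
    {
            /* walk the linker-assembled table, as prepare_for_smp() does */
            for (struct entry *e = __start_mytable; e != __stop_mytable; e++)
                    printf("%s\n", e->name);
            return 0;
    }
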
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/atomic.h        2006-02-15 20:45:57.000000000 +0000
-@@ -4,18 +4,13 @@
- #include <linux/config.h>
- #include <linux/compiler.h>
- #include <asm/processor.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * Atomic operations that C can't guarantee us.  Useful for
-  * resource counting etc..
-  */
- 
--#ifdef CONFIG_SMP
--#define LOCK "lock ; "
--#else
--#define LOCK ""
--#endif
--
- /*
-  * Make sure gcc doesn't try to be clever and move things around
-  * on us. We need to use _exactly_ the address the user gave us,
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/bitops.h        2006-02-15 20:45:57.000000000 +0000
-@@ -7,6 +7,7 @@
- 
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * These have to be done with inline assembly: that way the bit-setting
-@@ -16,12 +17,6 @@
-  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
-  */
- 
--#ifdef CONFIG_SMP
--#define LOCK_PREFIX "lock ; "
--#else
--#define LOCK_PREFIX ""
--#endif
--
- #define ADDR (*(volatile long *) addr)
- 
- /**
-@@ -41,7 +36,7 @@
-  */
- static inline void set_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btsl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
-  */
- static inline void clear_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btrl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
-  */
- static inline void change_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btcl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h ./include/asm-i386/futex.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h      2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/futex.h 2006-02-15 20:45:57.000000000 +0000
-@@ -28,7 +28,7 @@
- "1:   movl    %2, %0\n\
-       movl    %0, %3\n"                                       \
-       insn "\n"                                               \
--"2:   " LOCK_PREFIX "cmpxchgl %3, %2\n\
-+"2:   " LOCK "cmpxchgl %3, %2\n\
-       jnz     1b\n\
- 3:    .section .fixup,\"ax\"\n\
- 4:    mov     %5, %1\n\
-@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
- #endif
-               switch (op) {
-               case FUTEX_OP_ADD:
--                      __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
-+                      __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
-                                          oldval, uaddr, oparg);
-                       break;
-               case FUTEX_OP_OR:
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h      2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/rwsem.h 2006-02-15 20:45:57.000000000 +0000
-@@ -40,6 +40,7 @@
- 
- #include <linux/list.h>
- #include <linux/spinlock.h>
-+#include <asm/smp_alt.h>
- 
- struct rwsem_waiter;
- 
-@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
- {
-       __asm__ __volatile__(
-               "# beginning down_read\n\t"
--LOCK_PREFIX   "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
-+LOCK          "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
-               "  js        2f\n\t" /* jump if we weren't granted the lock */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
-               "  movl      %1,%2\n\t"
-               "  addl      %3,%2\n\t"
-               "  jle       2f\n\t"
--LOCK_PREFIX   "  cmpxchgl  %2,%0\n\t"
-+LOCK          "  cmpxchgl  %2,%0\n\t"
-               "  jnz       1b\n\t"
-               "2:\n\t"
-               "# ending __down_read_trylock\n\t"
-@@ -150,7 +151,7 @@ static inline void __down_write(struct r
-       tmp = RWSEM_ACTIVE_WRITE_BIAS;
-       __asm__ __volatile__(
-               "# beginning down_write\n\t"
--LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
-+LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
-               "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
-               "  jnz       2f\n\t" /* jump if we weren't granted the lock */
-               "1:\n\t"
-@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
-       __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
-       __asm__ __volatile__(
-               "# beginning __up_read\n\t"
--LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
-+LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
-               "  js        2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
-       __asm__ __volatile__(
-               "# beginning __up_write\n\t"
-               "  movl      %2,%%edx\n\t"
--LOCK_PREFIX   "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-+LOCK          "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-               "  jnz       2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
- {
-       __asm__ __volatile__(
-               "# beginning __downgrade_write\n\t"
--LOCK_PREFIX   "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-+LOCK          "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-               "  js        2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -263,7 +264,7 @@ LOCK_PREFIX        "  addl      %2,(%%eax)\n\t"
- static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
- {
-       __asm__ __volatile__(
--LOCK_PREFIX   "addl %1,%0"
-+LOCK            "addl %1,%0"
-               : "=m"(sem->count)
-               : "ir"(delta), "m"(sem->count));
- }
-@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
-       int tmp = delta;
- 
-       __asm__ __volatile__(
--LOCK_PREFIX   "xadd %0,(%2)"
-+LOCK                    "xadd %0,(%2)"
-               : "+r"(tmp), "=m"(sem->count)
-               : "r"(sem), "m"(sem->count)
-               : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h    1970-01-01 01:00:00.000000000 +0100
-+++ ./include/asm-i386/smp_alt.h       2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,32 @@
-+#ifndef __ASM_SMP_ALT_H__
-+#define __ASM_SMP_ALT_H__
-+
-+#include <linux/config.h>
-+
-+#ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define LOCK \
-+        "6677: nop\n" \
-+      ".section __smp_alternatives,\"a\"\n" \
-+      ".long 6677b\n" \
-+      ".long 6678f\n" \
-+      ".previous\n" \
-+      ".section __smp_replacements,\"a\"\n" \
-+      "6678: .byte 1\n" \
-+      ".byte 1\n" \
-+      ".byte 0\n" \
-+        ".byte 1\n" \
-+      ".byte -1\n" \
-+      "lock\n" \
-+      "nop\n" \
-+      ".previous\n"
-+void prepare_for_smp(void);
-+void unprepare_for_smp(void);
-+#else
-+#define LOCK "lock ; "
-+#endif
-+#else
-+#define LOCK ""
-+#endif
-+
-+#endif /* __ASM_SMP_ALT_H__ */
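
(Editorial annotation, not part of the changeset: the LOCK expansion
above is the record format from smpalts.c written out in inline
assembly.  Restated with a comment per emitted item; field names refer
to struct smp_replacement_record.)

    #define LOCK \
            "6677: nop\n"                         /* 1-byte placeholder, patched at boot */ \
            ".section __smp_alternatives,\"a\"\n" \
            ".long 6677b\n"                       /* targ_start: address of the nop      */ \
            ".long 6678f\n"                       /* repl: pointer to the record below   */ \
            ".previous\n" \
            ".section __smp_replacements,\"a\"\n" \
            "6678: .byte 1\n"                     /* targ_size: one byte at the site     */ \
            ".byte 1\n"                           /* smp1_size: the lock prefix          */ \
            ".byte 0\n"                           /* smp2_size: no feature variant       */ \
            ".byte 1\n"                           /* up_size: a plain nop                */ \
            ".byte -1\n"                          /* feature: none required              */ \
            "lock\n"                              /* data: SMP bytes ...                 */ \
            "nop\n"                               /* ... then UP bytes                   */ \
            ".previous\n"
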
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/spinlock.h      2006-02-15 20:45:57.000000000 +0000
-@@ -6,6 +6,7 @@
- #include <asm/page.h>
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * Your basic SMP spinlocks, allowing only a single CPU anywhere
-@@ -23,7 +24,8 @@
- 
- #define __raw_spin_lock_string \
-       "\n1:\t" \
--      "lock ; decb %0\n\t" \
-+      LOCK \
-+      "decb %0\n\t" \
-       "jns 3f\n" \
-       "2:\t" \
-       "rep;nop\n\t" \
-@@ -34,7 +36,8 @@
- 
- #define __raw_spin_lock_string_flags \
-       "\n1:\t" \
--      "lock ; decb %0\n\t" \
-+      LOCK \
-+      "decb %0\n\t" \
-       "jns 4f\n\t" \
-       "2:\t" \
-       "testl $0x200, %1\n\t" \
-@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
- static inline int __raw_spin_trylock(raw_spinlock_t *lock)
- {
-       char oldval;
-+#ifdef CONFIG_SMP_ALTERNATIVES
-       __asm__ __volatile__(
--              "xchgb %b0,%1"
-+              "1:movb %1,%b0\n"
-+              "movb $0,%1\n"
-+              "2:"
-+              ".section __smp_alternatives,\"a\"\n"
-+              ".long 1b\n"
-+              ".long 3f\n"
-+              ".previous\n"
-+              ".section __smp_replacements,\"a\"\n"
-+              "3: .byte 2b - 1b\n"
-+              ".byte 5f-4f\n"
-+              ".byte 0\n"
-+              ".byte 6f-5f\n"
-+              ".byte -1\n"
-+              "4: xchgb %b0,%1\n"
-+              "5: movb %1,%b0\n"
-+              "movb $0,%1\n"
-+              "6:\n"
-+              ".previous\n"
-               :"=q" (oldval), "=m" (lock->slock)
-               :"0" (0) : "memory");
-+#else
-+      __asm__ __volatile__(
-+              "xchgb %b0,%1\n"
-+              :"=q" (oldval), "=m" (lock->slock)
-+              :"0" (0) : "memory");
-+#endif
-       return oldval > 0;
- }
- 
-@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
- 
- static inline void __raw_read_unlock(raw_rwlock_t *rw)
- {
--      asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
-+      asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
- }
- 
- static inline void __raw_write_unlock(raw_rwlock_t *rw)
- {
--      asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
-+      asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
-                                : "=m" (rw->lock) : : "memory");
- }
- 
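
(Illustrative sketch, not part of the changeset: the two
__raw_spin_trylock bodies above, modeled in plain C.  slock follows the
2.6 convention of 1 = unlocked, <= 0 = locked.  On SMP the xchgb must be
atomic; the movb/movb pair that the UP replacement substitutes is safe
only because no second CPU exists to observe the lock between the two
instructions.)

    #include <stdio.h>

    static signed char slock = 1;              /* one lock, for the demo */

    static int trylock_up_variant(void)
    {
            signed char oldval = slock;        /* "movb %1,%b0" */
            slock = 0;                         /* "movb $0,%1"  */
            return oldval > 0;                 /* acquired iff it was free */
    }

    int main(void)
    {
            printf("first  try: %d\n", trylock_up_variant());  /* 1: acquired */
            printf("second try: %d\n", trylock_up_variant());  /* 0: held */
            return 0;
    }
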
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h ./include/asm-i386/system.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/system.h        2006-02-15 20:45:57.000000000 +0000
-@@ -5,7 +5,7 @@
- #include <linux/kernel.h>
- #include <asm/segment.h>
- #include <asm/cpufeature.h>
--#include <linux/bitops.h> /* for LOCK_PREFIX */
-+#include <asm/smp_alt.h>
- 
- #ifdef __KERNEL__
- 
-@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
-       unsigned long prev;
-       switch (size) {
-       case 1:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
-                                    : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-               return prev;
-       case 2:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
-                                    : "=a"(prev)
-                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-               return prev;
-       case 4:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
-                                    : "=a"(prev)
-                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
-                                     unsigned long long new)
- {
-       unsigned long long prev;
--      __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-+      __asm__ __volatile__(LOCK "cmpxchg8b %3"
-                            : "=A"(prev)
-                            : "b"((unsigned long)new),
-                              "c"((unsigned long)(new >> 32)),
-@@ -503,11 +503,55 @@ struct alt_instr { 
- #endif
- 
- #ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define smp_alt_mb(instr)                                           \
-+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
-+                   ".section __smp_alternatives,\"a\"\n"          \
-+                   ".long 6667b\n"                                \
-+                     ".long 6673f\n"                                \
-+                   ".previous\n"                                  \
-+                   ".section __smp_replacements,\"a\"\n"          \
-+                   "6673:.byte 6668b-6667b\n"                     \
-+                   ".byte 6670f-6669f\n"                          \
-+                   ".byte 6671f-6670f\n"                          \
-+                     ".byte 0\n"                                    \
-+                   ".byte %c0\n"                                  \
-+                   "6669:lock;addl $0,0(%%esp)\n"                 \
-+                   "6670:" instr "\n"                             \
-+                   "6671:\n"                                      \
-+                   ".previous\n"                                  \
-+                   :                                              \
-+                   : "i" (X86_FEATURE_XMM2)                       \
-+                   : "memory")
-+#define smp_rmb() smp_alt_mb("lfence")
-+#define smp_mb()  smp_alt_mb("mfence")
-+#define set_mb(var, value) do {                                     \
-+unsigned long __set_mb_temp;                                        \
-+__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
-+                   ".section __smp_alternatives,\"a\"\n"          \
-+                   ".long 6667b\n"                                \
-+                   ".long 6673f\n"                                \
-+                   ".previous\n"                                  \
-+                   ".section __smp_replacements,\"a\"\n"          \
-+                   "6673: .byte 6668b-6667b\n"                    \
-+                   ".byte 6670f-6669f\n"                          \
-+                   ".byte 0\n"                                    \
-+                   ".byte 6671f-6670f\n"                          \
-+                   ".byte -1\n"                                   \
-+                   "6669: xchg %1, %0\n"                          \
-+                   "6670:movl %1, %0\n"                           \
-+                   "6671:\n"                                      \
-+                   ".previous\n"                                  \
-+                   : "=m" (var), "=r" (__set_mb_temp)             \
-+                   : "1" (value)                                  \
-+                   : "memory"); } while (0)
-+#else
- #define smp_mb()      mb()
- #define smp_rmb()     rmb()
-+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-+#endif
- #define smp_wmb()     wmb()
- #define smp_read_barrier_depends()    read_barrier_depends()
--#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
- #else
- #define smp_mb()      barrier()
- #define smp_rmb()     barrier()
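
(Illustrative sketch, not part of the changeset: the smp_alt_mb() record
above gives the patcher three choices for each barrier site, selected by
the logic in prepare_for_smp()/unprepare_for_smp(): six nops on UP,
"lock; addl $0,0(%esp)" as the portable SMP fallback, or lfence/mfence
when the feature byte X86_FEATURE_XMM2 matches.  The selection reduces
to the following; has_sse2 stands in for the kernel's boot_cpu_has().)

    #include <stdio.h>

    enum barrier { UP_NOPS, SMP_LOCK_ADDL, SMP_FENCE };

    static enum barrier pick_barrier(int smp, int has_sse2)
    {
            if (!smp)
                    return UP_NOPS;        /* unprepare_for_smp(): nop out */
            return has_sse2 ? SMP_FENCE    /* feature matched: use smp2 bytes */
                            : SMP_LOCK_ADDL;
    }

    int main(void)
    {
            printf("UP: %d  SMP: %d  SMP+SSE2: %d\n",
                   pick_barrier(0, 0), pick_barrier(1, 0), pick_barrier(1, 1));
            return 0;
    }
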
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/uaccess.h
--- a/xen/include/asm-ia64/linux-xen/asm/uaccess.h      Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,415 +0,0 @@
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary.  This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses.  Thus, we need to be careful not to let the user to
- * trick us into accessing kernel memory that would normally be
- * inaccessible.  This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped.  The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it.  This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault.  When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/page-flags.h>
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-
-/*
- * For historical reasons, the following macros are grossly misnamed:
- */
-#define KERNEL_DS      ((mm_segment_t) { ~0UL })               /* cf. access_ok() */
-#define USER_DS                ((mm_segment_t) { TASK_SIZE-1 })        /* cf. access_ok() */
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
-#define get_ds()  (KERNEL_DS)
-#define get_fs()  (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
-#define segment_eq(a, b)       ((a).seg == (b).seg)
-
-/*
- * When accessing user memory, we need to make sure the entire area really is in
- * user-level space.  In order to do this efficiently, we make sure that the page at
- * address TASK_SIZE is never valid.  We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-#ifdef XEN
-#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
-#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr)))
-#else
-#define __access_ok(addr, size, segment)                                               \
-({                                                                                     \
-       __chk_user_ptr(addr);                                                           \
-       (likely((unsigned long) (addr) <= (segment).seg)                                \
-        && ((segment).seg == KERNEL_DS.seg                                             \
-            || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT)));        \
-})
-#endif
-#define access_ok(type, addr, size)    __access_ok((addr), (size), get_fs())
-
-/* this function will go away soon - use access_ok() instead */
-static inline int __deprecated
-verify_area (int type, const void __user *addr, unsigned long size)
-{
-       return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
-#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr)     __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr)     __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-extern long __put_user_unaligned_unknown (void);
-
-#define __put_user_unaligned(x, ptr)                                                           \
-({                                                                                             \
-       long __ret;                                                                             \
-       switch (sizeof(*(ptr))) {                                                               \
-               case 1: __ret = __put_user((x), (ptr)); break;                                  \
-               case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))                           \
-                       | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;              \
-               case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))                          \
-                       | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;            \
-               case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))                          \
-                       | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;            \
-               default: __ret = __put_user_unaligned_unknown();                                \
-       }                                                                                       \
-       __ret;                                                                                  \
-})
-
-extern long __get_user_unaligned_unknown (void);
-
-#define __get_user_unaligned(x, ptr)                                                           \
-({                                                                                             \
-       long __ret;                                                                             \
-       switch (sizeof(*(ptr))) {                                                               \
-               case 1: __ret = __get_user((x), (ptr)); break;                                  \
-               case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))                           \
-                       | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;              \
-               case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))                          \
-                       | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;            \
-               case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))                          \
-                       | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;            \
-               default: __ret = __get_user_unaligned_unknown();                                \
-       }                                                                                       \
-       __ret;                                                                                  \
-})
-
-#ifdef ASM_SUPPORTED
-  struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata.  */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       register long __gu_r8 asm ("r8") = 0;                                                   \
-       register long __gu_r9 asm ("r9");                                                       \
-       asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"     \
-            "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"                                         \
-            "[1:]"                                                                             \
-            : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));                    \
-       (err) = __gu_r8;                                                                        \
-       (val) = __gu_r9;                                                                        \
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of 
writing it.  This
- * is because they do not write to any memory gcc knows about, so there are no 
aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       register long __pu_r8 asm ("r8") = 0;                                                   \
-       asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \
-                     "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"                                  \
-                     "[1:]"                                                                    \
-                     : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));               \
-       (err) = __pu_r8;                                                                        \
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE    2       /* ip-rel */
-# define __get_user_size(val, addr, n, err)                            \
-do {                                                                   \
-       __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);   \
-       (err) = ia64_getreg(_IA64_REG_R8);                              \
-       (val) = ia64_getreg(_IA64_REG_R9);                              \
-} while (0)
-# define __put_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val));    \
-       (err) = ia64_getreg(_IA64_REG_R8);                                                      \
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size, segment)                                    \
-({                                                                                     \
-       const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);                              \
-       __typeof__ (size) __gu_size = (size);                                           \
-       long __gu_err = -EFAULT, __gu_val = 0;                                          \
-                                                                                       \
-       if (!check || __access_ok(__gu_ptr, size, segment))                             \
-               switch (__gu_size) {                                                    \
-                     case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;  \
-                     case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;  \
-                     case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;  \
-                     case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;  \
-                     default: __get_user_unknown(); break;                             \
-               }                                                                       \
-       (x) = (__typeof__(*(__gu_ptr))) __gu_val;                                       \
-       __gu_err;                                                                       \
-})
-
-#define __get_user_nocheck(x, ptr, size)       __do_get_user(0, x, ptr, size, KERNEL_DS)
-#define __get_user_check(x, ptr, size, segment)        __do_get_user(1, x, ptr, size, segment)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size, segment)                                    \
-({                                                                                     \
-       __typeof__ (x) __pu_x = (x);                                                    \
-       __typeof__ (*(ptr)) __user *__pu_ptr = (ptr);                                   \
-       __typeof__ (size) __pu_size = (size);                                           \
-       long __pu_err = -EFAULT;                                                        \
-                                                                                       \
-       if (!check || __access_ok(__pu_ptr, __pu_size, segment))                        \
-               switch (__pu_size) {                                                    \
-                     case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;    \
-                     case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;    \
-                     case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;    \
-                     case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;    \
-                     default: __put_user_unknown(); break;                             \
-               }                                                                       \
-       __pu_err;                                                                       \
-})
-
-#define __put_user_nocheck(x, ptr, size)       __do_put_user(0, x, ptr, size, KERNEL_DS)
-#define __put_user_check(x, ptr, size, segment)        __do_put_user(1, x, ptr, size, segment)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
-                                              unsigned long count);
-
-static inline unsigned long
-__copy_to_user (void __user *to, const void *from, unsigned long count)
-{
-       return __copy_user(to, (void __user *) from, count);
-}
-
-static inline unsigned long
-__copy_from_user (void *to, const void __user *from, unsigned long count)
-{
-       return __copy_user((void __user *) to, from, count);
-}
-
-#define __copy_to_user_inatomic                __copy_to_user
-#define __copy_from_user_inatomic      __copy_from_user
-#define copy_to_user(to, from, n)                                                      \
-({                                                                                     \
-       void __user *__cu_to = (to);                                                    \
-       const void *__cu_from = (from);                                                 \
-       long __cu_len = (n);                                                            \
-                                                                                       \
-       if (__access_ok(__cu_to, __cu_len, get_fs()))                                   \
-               __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);   \
-       __cu_len;                                                                       \
-})
-
-#define copy_from_user(to, from, n)                                                    \
-({                                                                                     \
-       void *__cu_to = (to);                                                           \
-       const void __user *__cu_from = (from);                                          \
-       long __cu_len = (n);                                                            \
-                                                                                       \
-       __chk_user_ptr(__cu_from);                                                      \
-       if (__access_ok(__cu_from, __cu_len, get_fs()))                                 \
-               __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);   \
-       __cu_len;                                                                       \
-})
-
-#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
-
-static inline unsigned long
-copy_in_user (void __user *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
-               n = __copy_user(to, from, n);
-       return n;
-}
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n)            __do_clear_user(to, n)
-
-#define clear_user(to, n)                                      \
-({                                                             \
-       unsigned long __cu_len = (n);                           \
-       if (__access_ok(to, __cu_len, get_fs()))                \
-               __cu_len = __do_clear_user(to, __cu_len);       \
-       __cu_len;                                               \
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n)                                 \
-({                                                                     \
-       const char __user * __sfu_from = (from);                        \
-       long __sfu_ret = -EFAULT;                                       \
-       if (__access_ok(__sfu_from, 0, get_fs()))                       \
-               __sfu_ret = __strncpy_from_user((to), __sfu_from, (n)); \
-       __sfu_ret;                                                      \
-})
-
-/* Returns: 0 if bad, string length+1 (memory size) of string if ok */
-extern unsigned long __strlen_user (const char __user *);
-
-#define strlen_user(str)                               \
-({                                                     \
-       const char __user *__su_str = (str);            \
-       unsigned long __su_ret = 0;                     \
-       if (__access_ok(__su_str, 0, get_fs()))         \
-               __su_ret = __strlen_user(__su_str);     \
-       __su_ret;                                       \
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len)                                 \
-({                                                             \
-       const char __user *__su_str = (str);                    \
-       unsigned long __su_ret = 0;                             \
-       if (__access_ok(__su_str, 0, get_fs()))                 \
-               __su_ret = __strnlen_user(__su_str, len);       \
-       __su_ret;                                               \
-})
-
-/* Generic code can't deal with the location-relative format that we use for compactness.  */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
-
-struct exception_table_entry {
-       int addr;       /* location-relative address of insn this fixup is for */
-       int cont;       /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
-extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
-
-static inline int
-ia64_done_with_exception (struct pt_regs *regs)
-{
-       const struct exception_table_entry *e;
-       e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
-       if (e) {
-               ia64_handle_exception(regs, e);
-               return 1;
-       }
-       return 0;
-}
-
-#ifndef XEN
-#define ARCH_HAS_TRANSLATE_MEM_PTR     1
-static __inline__ char *
-xlate_dev_mem_ptr (unsigned long p)
-{
-       struct page *page;
-       char * ptr;
-
-       page = mfn_to_page(p >> PAGE_SHIFT);
-       if (PageUncached(page))
-               ptr = (char *)p + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = __va(p);
-
-       return ptr;
-}
-
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ char *
-xlate_dev_kmem_ptr (char * p)
-{
-       struct page *page;
-       char * ptr;
-
-       page = virt_to_page((unsigned long)p >> PAGE_SHIFT);
-       if (PageUncached(page))
-               ptr = (char *)__pa(p) + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = p;
-
-       return ptr;
-}
-#endif
-
-#endif /* _ASM_IA64_UACCESS_H */
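
(Illustrative sketch, not part of the changeset: the one Xen-specific
twist in the header removed above is the __access_ok() shortcut, which
replaces the Linux segment check with a rejection of hypervisor
addresses.  IS_VMM_ADDRESS flags an address whenever bits 59 and 60
differ.  A self-contained rendering of that predicate; the example
addresses are invented for the demo.)

    #include <stdio.h>

    #define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)

    int main(void)
    {
            unsigned long guest = 0x0000000000400000UL; /* bits 59,60 both clear */
            unsigned long vmm   = 0xf000000000000000UL; /* bit 60 set, bit 59 clear */

            printf("guest: %s\n", IS_VMM_ADDRESS(guest) ? "rejected" : "ok");
            printf("vmm:   %s\n", IS_VMM_ADDRESS(vmm)   ? "rejected" : "ok");
            return 0;
    }
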

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
