WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] x86, vmx: Enable EPT (Extended PageTable)

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 09 Apr 2008 06:50:12 -0700
Delivery-date: Wed, 09 Apr 2008 06:50:24 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1207737032 -3600
# Node ID 9b635405ef901c9939a62d83d53ae681333954ea
# Parent  e1962ac0fb1c03fe4e7d65130e75dd0670eccad6
x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors.

We use the EPT page table as the P2M (guest-physical-to-machine
mapping), removing the linear page table when EPT is used for the
domain (see the new file p2m-ept.c). This is done by adding three
operations (set_entry, get_entry and get_entry_current) to p2m_domain.
If VT-d is enabled, the EPT page table is also used as the VT-d page
table (i.e. the two are shared).

Signed-off-by: Xin Li <xin.b.li@xxxxxxxxx>
Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@xxxxxxxxx>
Signed-off-by: Keir Fraser <keir.fraser@xxxxxxxxxx>
---
 tools/libxc/xc_hvm_build.c          |   72 ++++---
 xen/arch/x86/domain.c               |    4 
 xen/arch/x86/hvm/hvm.c              |   27 ++
 xen/arch/x86/hvm/vmx/vmcs.c         |   64 +++++-
 xen/arch/x86/hvm/vmx/vmx.c          |  172 +++++++++++++++--
 xen/arch/x86/mm.c                   |    2 
 xen/arch/x86/mm/hap/Makefile        |    1 
 xen/arch/x86/mm/hap/p2m-ept.c       |  187 ++++++++++++++++++
 xen/arch/x86/mm/p2m.c               |  362 +++++++++++++++++++++---------------
 xen/arch/x86/mm/paging.c            |   13 -
 xen/common/domctl.c                 |    3 
 xen/drivers/passthrough/vtd/iommu.c |   43 +++-
 xen/include/asm-x86/domain.h        |   25 --
 xen/include/asm-x86/hvm/domain.h    |   12 -
 xen/include/asm-x86/hvm/svm/vmcb.h  |    6 
 xen/include/asm-x86/hvm/vmx/vmcs.h  |   39 +++
 xen/include/asm-x86/hvm/vmx/vmx.h   |   55 +++++
 xen/include/asm-x86/p2m.h           |   76 +++----
 xen/include/asm-x86/paging.h        |    2 
 xen/include/public/hvm/params.h     |    3 
 xen/include/xen/hypercall.h         |    1 
 21 files changed, 883 insertions(+), 286 deletions(-)

diff -r e1962ac0fb1c -r 9b635405ef90 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Tue Apr 08 11:41:27 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c        Wed Apr 09 11:30:32 2008 +0100
@@ -20,6 +20,13 @@
 #include <xen/libelf.h>
 
 #define SCRATCH_PFN 0xFFFFF
+
+#define SPECIALPAGE_GUARD    0
+#define SPECIALPAGE_BUFIOREQ 1
+#define SPECIALPAGE_XENSTORE 2
+#define SPECIALPAGE_IOREQ    3
+#define SPECIALPAGE_IDENT_PT 4
+#define NR_SPECIAL_PAGES     5
 
 static void build_e820map(void *e820_page, unsigned long long mem_size)
 {
@@ -77,21 +84,16 @@ static void build_e820map(void *e820_pag
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
 
-    /*
-     * Low RAM goes here. Remove 4 pages for: ioreq, bufioreq, and xenstore.
-     *  1. Guard page.
-     *  2. Buffered ioreq.
-     *  3. Xenstore.
-     *  4. Normal ioreq.
-     */
+    /* Low RAM goes here. Reserve space for special pages. */
     e820entry[nr_map].addr = 0x100000;
-    e820entry[nr_map].size = mem_size - 0x100000 - PAGE_SIZE * 4;
+    e820entry[nr_map].size = (mem_size - 0x100000 -
+                              PAGE_SIZE * NR_SPECIAL_PAGES);
     e820entry[nr_map].type = E820_RAM;
     nr_map++;
 
-    /* Explicitly reserve space for special pages. */
-    e820entry[nr_map].addr = mem_size - PAGE_SIZE * 3;
-    e820entry[nr_map].size = PAGE_SIZE * 3;
+    /* Explicitly reserve space for special pages (excluding guard page). */
+    e820entry[nr_map].addr = mem_size - PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
+    e820entry[nr_map].size = PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
 
@@ -156,10 +158,11 @@ static int setup_guest(int xc_handle,
 {
     xen_pfn_t *page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long shared_page_nr, entry_eip;
+    unsigned long special_page_nr, entry_eip;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
+    uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
@@ -245,29 +248,46 @@ static int setup_guest(int xc_handle,
            sizeof(shared_info->evtchn_mask));
     munmap(shared_info, PAGE_SIZE);
 
-    if ( v_end > HVM_BELOW_4G_RAM_END )
-        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
-    else
-        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+    special_page_nr = (((v_end > HVM_BELOW_4G_RAM_END)
+                        ? (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT)
+                        : (v_end >> PAGE_SHIFT))
+                       - NR_SPECIAL_PAGES);
+
+    /* Paranoia: clean special pages. */
+    for ( i = 0; i < NR_SPECIAL_PAGES; i++ )
+        if ( xc_clear_domain_page(xc_handle, dom, special_page_nr + i) )
+            goto error_out;
 
     /* Free the guard page that separates low RAM from special pages. */
     rc = xc_domain_memory_decrease_reservation(
-            xc_handle, dom, 1, 0, &page_array[shared_page_nr-3]);
+        xc_handle, dom, 1, 0, &page_array[special_page_nr]);
     if ( rc != 0 )
     {
         PERROR("Could not deallocate guard page for HVM guest.\n");
         goto error_out;
     }
 
-    /* Paranoia: clean pages. */
-    if ( xc_clear_domain_page(xc_handle, dom, shared_page_nr) ||
-         xc_clear_domain_page(xc_handle, dom, shared_page_nr-1) ||
-         xc_clear_domain_page(xc_handle, dom, shared_page_nr-2) )
-        goto error_out;
-
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     special_page_nr + SPECIALPAGE_XENSTORE);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     special_page_nr + SPECIALPAGE_BUFIOREQ);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     special_page_nr + SPECIALPAGE_IOREQ);
+
+    /*
+     * Identity-map page table is required for running with CR0.PG=0 when
+     * using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
+     */
+    if ( (ident_pt = xc_map_foreign_range(
+              xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
+              special_page_nr + SPECIALPAGE_IDENT_PT)) == NULL )
+        goto error_out;
+    for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+        ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+    munmap(ident_pt, PAGE_SIZE);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                     special_page_nr + SPECIALPAGE_IDENT_PT);
 
     /* Insert JMP <rel32> instruction at address 0x0 to reach entry point. */
     entry_eip = elf_uval(&elf, elf.ehdr, e_entry);
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/domain.c     Wed Apr 09 11:30:32 2008 +0100
@@ -503,13 +503,15 @@ int arch_domain_create(struct domain *d,
     HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
 #endif
 
-    paging_domain_init(d);
+    if ( (rc = paging_domain_init(d)) != 0 )
+        goto fail;
     paging_initialised = 1;
 
     if ( !is_idle_domain(d) )
     {
         d->arch.ioport_caps = 
             rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+        rc = -ENOMEM;
         if ( d->arch.ioport_caps == NULL )
             goto fail;
 
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Wed Apr 09 11:30:32 2008 +0100
@@ -2212,6 +2212,33 @@ long do_hvm_op(unsigned long op, XEN_GUE
                 if ( a.value > HVMPTM_one_missed_tick_pending )
                     goto param_fail;
                 break;
+            case HVM_PARAM_IDENT_PT:
+                rc = -EPERM;
+                if ( current->domain->domain_id != 0 )
+                    goto param_fail;
+
+                rc = -EINVAL;
+                if ( d->arch.hvm_domain.params[a.index] != 0 )
+                    goto param_fail;
+
+                if ( !paging_mode_hap(d) )
+                    break;
+
+                domain_pause(d);
+
+                /*
+                 * Update GUEST_CR3 in each VMCS to point at identity map.
+                 * All foreign updates to guest state must synchronise on
+                 * the domctl_lock.
+                 */
+                spin_lock(&domctl_lock);
+                d->arch.hvm_domain.params[a.index] = a.value;
+                for_each_vcpu ( d, v )
+                    paging_update_cr3(v);
+                spin_unlock(&domctl_lock);
+
+                domain_unpause(d);
+                break;
             }
             d->arch.hvm_domain.params[a.index] = a.value;
             rc = 0;
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Apr 09 11:30:32 2008 +0100
@@ -84,14 +84,16 @@ static void vmx_init_vmcs_config(void)
 
     min = (CPU_BASED_HLT_EXITING |
            CPU_BASED_INVLPG_EXITING |
+           CPU_BASED_CR3_LOAD_EXITING |
+           CPU_BASED_CR3_STORE_EXITING |
            CPU_BASED_MONITOR_EXITING |
            CPU_BASED_MWAIT_EXITING |
            CPU_BASED_MOV_DR_EXITING |
            CPU_BASED_ACTIVATE_IO_BITMAP |
            CPU_BASED_USE_TSC_OFFSETING);
-    opt  = CPU_BASED_ACTIVATE_MSR_BITMAP;
-    opt |= CPU_BASED_TPR_SHADOW;
-    opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+    opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
+           CPU_BASED_TPR_SHADOW |
+           CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
     _vmx_cpu_based_exec_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
 #ifdef __x86_64__
@@ -107,9 +109,21 @@ static void vmx_init_vmcs_config(void)
     {
         min = 0;
         opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-               SECONDARY_EXEC_WBINVD_EXITING);
+               SECONDARY_EXEC_WBINVD_EXITING |
+               SECONDARY_EXEC_ENABLE_EPT);
         _vmx_secondary_exec_control = adjust_vmx_controls(
             min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
+    }
+
+    if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+    {
+        /* To use EPT we expect to be able to clear certain intercepts. */
+        uint32_t must_be_one, must_be_zero;
+        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero);
+        if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
+                            CPU_BASED_CR3_LOAD_EXITING |
+                            CPU_BASED_CR3_STORE_EXITING) )
+            _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
     }
 
 #if defined(__i386__)
@@ -301,6 +315,8 @@ int vmx_cpu_up(void)
         return 0;
     }
 
+    ept_sync_all();
+
     return 1;
 }
 
@@ -439,6 +455,7 @@ void vmx_disable_intercept_for_msr(struc
 
 static int construct_vmcs(struct vcpu *v)
 {
+    struct domain *d = v->domain;
     uint16_t sysenter_cs;
     unsigned long sysenter_eip;
 
@@ -448,10 +465,25 @@ static int construct_vmcs(struct vcpu *v
     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
     __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
     __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
-    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
+
     v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
-    if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
-        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);
+    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
+
+    if ( paging_mode_hap(d) )
+    {
+        v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+                                          CPU_BASED_CR3_LOAD_EXITING |
+                                          CPU_BASED_CR3_STORE_EXITING);
+    }
+    else
+    {
+        v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+    }
+
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+    if ( cpu_has_vmx_secondary_exec_control )
+        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+                  v->arch.hvm_vmx.secondary_exec_control);
 
     /* MSR access bitmap. */
     if ( cpu_has_vmx_msr_bitmap )
@@ -570,9 +602,10 @@ static int construct_vmcs(struct vcpu *v
     __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
 #endif
 
-    __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK |
-                                 (1U << TRAP_page_fault) |
-                                 (1U << TRAP_no_device)));
+    __vmwrite(EXCEPTION_BITMAP,
+              HVM_TRAP_MASK
+              | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
+              | (1U << TRAP_no_device));
 
     v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
     hvm_update_guest_cr(v, 0);
@@ -585,6 +618,15 @@ static int construct_vmcs(struct vcpu *v
         __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                   page_to_maddr(vcpu_vlapic(v)->regs_page));
         __vmwrite(TPR_THRESHOLD, 0);
+    }
+
+    if ( paging_mode_hap(d) )
+    {
+        __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp);
+#ifdef CONFIG_X86_PAE
+        __vmwrite(EPT_POINTER_HIGH,
+                  d->arch.hvm_domain.vmx.ept_control.eptp >> 32);
+#endif
     }
 
     vmx_vmcs_exit(v);
@@ -932,6 +974,8 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
     printk("TPR Threshold = 0x%02x\n",
            (uint32_t)vmr(TPR_THRESHOLD));
+    printk("EPT pointer = 0x%08x%08x\n",
+           (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
 
     vmx_vmcs_exit(v);
 }
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Apr 09 11:30:32 2008 +0100
@@ -71,11 +71,17 @@ static void vmx_invlpg_intercept(unsigne
 
 static int vmx_domain_initialise(struct domain *d)
 {
+    d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
+    d->arch.hvm_domain.vmx.ept_control.gaw  = EPT_DEFAULT_GAW;
+    d->arch.hvm_domain.vmx.ept_control.asr  =
+        pagetable_get_pfn(d->arch.phys_table);
+
     return vmx_alloc_vlapic_mapping(d);
 }
 
 static void vmx_domain_destroy(struct domain *d)
 {
+    ept_sync_domain(d);
     vmx_free_vlapic_mapping(d);
 }
 
@@ -492,20 +498,23 @@ static int vmx_restore_cr0_cr3(
     unsigned long mfn = 0;
     p2m_type_t p2mt;
 
-    if ( cr0 & X86_CR0_PG )
-    {
-        mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
-        if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+    if ( paging_mode_shadow(v->domain) )
+    {
+        if ( cr0 & X86_CR0_PG )
         {
-            gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
-            return -EINVAL;
+            mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+            if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+            {
+                gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
+                return -EINVAL;
+            }
         }
-    }
-
-    if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
-        put_page(pagetable_get_page(v->arch.guest_table));
-
-    v->arch.guest_table = pagetable_from_pfn(mfn);
+
+        if ( hvm_paging_enabled(v) )
+            put_page(pagetable_get_page(v->arch.guest_table));
+
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+    }
 
     v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
     v->arch.hvm_vcpu.guest_cr[3] = cr3;
@@ -900,6 +909,56 @@ static void vmx_set_interrupt_shadow(str
     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
 }
 
+static void vmx_load_pdptrs(struct vcpu *v)
+{
+    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
+    uint64_t *guest_pdptrs;
+    p2m_type_t p2mt;
+    char *p;
+
+    /* EPT needs to load PDPTRS into VMCS for PAE. */
+    if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
+        return;
+
+    if ( cr3 & 0x1fUL )
+        goto crash;
+
+    mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+    if ( !p2m_is_ram(p2mt) )
+        goto crash;
+
+    p = map_domain_page(mfn);
+
+    guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
+
+    /*
+     * We do not check the PDPTRs for validity. The CPU will do this during
+     * vm entry, and we can handle the failure there and crash the guest.
+     * The only thing we could do better here is #GP instead.
+     */
+
+    vmx_vmcs_enter(v);
+
+    __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
+    __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
+    __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
+    __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
+#ifdef CONFIG_X86_PAE
+    __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
+    __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
+    __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
+    __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
+#endif
+
+    vmx_vmcs_exit(v);
+
+    unmap_domain_page(p);
+    return;
+
+ crash:
+    domain_crash(v->domain);
+}
+
 static void vmx_update_host_cr3(struct vcpu *v)
 {
     vmx_vmcs_enter(v);
@@ -915,7 +974,24 @@ static void vmx_update_guest_cr(struct v
     {
     case 0: {
         unsigned long hw_cr0_mask =
-            X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
+            X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
+
+        if ( paging_mode_shadow(v->domain) )
+           hw_cr0_mask |= X86_CR0_WP;
+
+        if ( paging_mode_hap(v->domain) )
+        {
+            /* We manage GUEST_CR3 when guest CR0.PG is zero. */
+            uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
+                                 CPU_BASED_CR3_STORE_EXITING);
+            v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
+            if ( !hvm_paging_enabled(v) )
+                v->arch.hvm_vmx.exec_control |= cr3_ctls;
+            __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+
+            /* Changing CR0.PG can change some bits in real CR4. */
+            vmx_update_guest_cr(v, 4);
+        }
 
         if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
         {
@@ -939,11 +1015,26 @@ static void vmx_update_guest_cr(struct v
         /* CR2 is updated in exit stub. */
         break;
     case 3:
+        if ( paging_mode_hap(v->domain) )
+        {
+            if ( !hvm_paging_enabled(v) )
+                v->arch.hvm_vcpu.hw_cr[3] =
+                    v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
+            vmx_load_pdptrs(v);
+        }
+ 
         __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
         break;
     case 4:
-        v->arch.hvm_vcpu.hw_cr[4] =
-            v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
+        v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
+        if ( paging_mode_hap(v->domain) )
+            v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+        v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
+        {
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
+            v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+        }
         __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
         __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
         break;
@@ -983,7 +1074,18 @@ static void vmx_flush_guest_tlbs(void)
      * because VMRESUME will flush it for us. */
 }
 
-
+static void __ept_sync_domain(void *info)
+{
+    struct domain *d = info;
+    __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
+}
+
+void ept_sync_domain(struct domain *d)
+{
+    /* Only if using EPT and this domain has some VCPUs to dirty. */
+    if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] )
+        on_each_cpu(__ept_sync_domain, d, 1, 1);
+}
 
 static void __vmx_inject_exception(
     struct vcpu *v, int trap, int type, int error_code)
@@ -1133,6 +1235,12 @@ void start_vmx(void)
         return;
     }
 
+    if ( cpu_has_vmx_ept )
+    {
+        printk("VMX: EPT is available.\n");
+        vmx_function_table.hap_supported = 1;
+    }
+
     setup_vmcs_dump();
 
     hvm_enable(&vmx_function_table);
@@ -1635,14 +1743,14 @@ static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
     set_mmio_p2m_entry(
         d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
-    d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
+    d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
 
     return 0;
 }
 
 static void vmx_free_vlapic_mapping(struct domain *d)
 {
-    unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
+    unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
     if ( mfn != 0 )
         free_xenheap_page(mfn_to_virt(mfn));
 }
@@ -1655,7 +1763,7 @@ static void vmx_install_vlapic_mapping(s
         return;
 
     virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
-    apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
+    apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
     apic_page_ma <<= PAGE_SHIFT;
 
     vmx_vmcs_enter(v);
@@ -1900,6 +2008,17 @@ static void vmx_wbinvd_intercept(void)
         wbinvd();
 }
 
+static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
+{
+    if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
+    {
+        domain_crash(current->domain);
+        return;
+    }
+
+    handle_mmio();
+}
+
 static void vmx_failed_vmentry(unsigned int exit_reason,
                                struct cpu_user_regs *regs)
 {
@@ -1938,6 +2057,10 @@ asmlinkage void vmx_vmexit_handler(struc
     unsigned int exit_reason, idtv_info;
     unsigned long exit_qualification, inst_len = 0;
     struct vcpu *v = current;
+
+    if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
+        v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
+            __vmread(GUEST_CR3);
 
     exit_reason = __vmread(VM_EXIT_REASON);
 
@@ -2171,6 +2294,17 @@ asmlinkage void vmx_vmexit_handler(struc
         break;
     }
 
+    case EXIT_REASON_EPT_VIOLATION:
+    {
+        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+#ifdef CONFIG_X86_PAE
+        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
+#endif
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        ept_handle_violation(exit_qualification, gpa);
+        break;
+    }
+
     default:
     exit_and_crash:
         gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/mm.c Wed Apr 09 11:30:32 2008 +0100
@@ -299,7 +299,7 @@ unsigned long domain_get_maximum_gpfn(st
 unsigned long domain_get_maximum_gpfn(struct domain *d)
 {
     if ( is_hvm_domain(d) )
-        return d->arch.p2m.max_mapped_pfn;
+        return d->arch.p2m->max_mapped_pfn;
     /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
     return arch_get_max_pfn(d) - 1;
 }
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile      Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/mm/hap/Makefile      Wed Apr 09 11:30:32 2008 +0100
@@ -2,6 +2,7 @@ obj-y += guest_walk_2level.o
 obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
+obj-y += p2m-ept.o
 
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/mm/hap/p2m-ept.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/hap/p2m-ept.c     Wed Apr 09 11:30:32 2008 +0100
@@ -0,0 +1,187 @@
+/*
+ * p2m-ept.c: use the EPT page table as p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <xen/iommu.h>
+
+static int ept_next_level(struct domain *d, bool_t read_only,
+                          ept_entry_t **table, unsigned long *gfn_remainder,
+                          u32 shift)
+{
+    ept_entry_t *ept_entry, *next;
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    *gfn_remainder &= (1UL << shift) - 1;
+
+    ept_entry = (*table) + index;
+
+    if ( !(ept_entry->epte & 0x7) )
+    {
+        struct page_info *pg;
+
+        if ( read_only )
+            return 0;
+
+        pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+
+        pg->count_info = 1;
+        pg->u.inuse.type_info = 1 | PGT_validated;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+
+        ept_entry->emt = 0;
+        ept_entry->sp_avail = 0;
+        ept_entry->avail1 = 0;
+        ept_entry->mfn = page_to_mfn(pg);
+        ept_entry->rsvd = 0;
+        ept_entry->avail2 = 0;
+        /* last step */
+        ept_entry->r = ept_entry->w = ept_entry->x = 1;
+    }
+
+    next = map_domain_page(ept_entry->mfn);
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+static int
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{
+    ept_entry_t *table =
+        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry = NULL;
+    u32 index;
+    int i, rv = 0;
+
+    /* Should check if gfn obeys GAW here */
+
+    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER) )
+            goto out;
+
+    index = gfn_remainder;
+    ept_entry = table + index;
+
+    if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
+    {
+        /* Track the highest gfn for which we have ever had a valid mapping */
+        if ( gfn > d->arch.p2m->max_mapped_pfn )
+            d->arch.p2m->max_mapped_pfn = gfn;
+
+        ept_entry->emt = EPT_DEFAULT_MT;
+        ept_entry->sp_avail = 0;
+        ept_entry->avail1 = p2mt;
+        ept_entry->mfn = mfn_x(mfn);
+        ept_entry->rsvd = 0;
+        ept_entry->avail2 = 0;
+        /* last step */
+        ept_entry->r = ept_entry->w = ept_entry->x = 1;
+    }
+    else
+        ept_entry->epte = 0;
+
+    /* Success */
+    rv = 1;
+
+ out:
+    unmap_domain_page(table);
+
+    ept_sync_domain(d);
+
+    /* If p2m table is shared with vtd page-table. */
+    if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
+        iommu_flush(d, gfn, (u64*)ept_entry);
+
+    return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    ept_entry_t *table =
+        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    u32 index;
+    int i;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    *t = p2m_mmio_dm;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > d->arch.p2m->max_mapped_pfn )
+        goto out;
+
+    /* Should check if gfn obeys GAW here. */
+
+    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER) )
+            goto out;
+
+    index = gfn_remainder;
+    ept_entry = table + index;
+
+    if ( (ept_entry->epte & 0x7) == 0x7 )
+    {
+        if ( ept_entry->avail1 != p2m_invalid )
+        {
+            *t = ept_entry->avail1;
+            mfn = _mfn(ept_entry->mfn);
+        }
+    }
+
+ out:
+    unmap_domain_page(table);
+    return mfn;
+}
+
+static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t)
+{
+    return ept_get_entry(current->domain, gfn, t);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+    d->arch.p2m->set_entry = ept_set_entry;
+    d->arch.p2m->get_entry = ept_get_entry;
+    d->arch.p2m->get_entry_current = ept_get_entry_current;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c     Wed Apr 09 11:30:32 2008 +0100
@@ -27,6 +27,7 @@
 #include <asm/page.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
 #include <xen/iommu.h>
 
 /* Debugging and auditing of the P2M code? */
@@ -41,35 +42,34 @@
  * Locking discipline: always acquire this lock before the shadow or HAP one
  */
 
-#define p2m_lock_init(_d)                            \
-    do {                                             \
-        spin_lock_init(&(_d)->arch.p2m.lock);        \
-        (_d)->arch.p2m.locker = -1;                  \
-        (_d)->arch.p2m.locker_function = "nobody";   \
+#define p2m_lock_init(_p2m)                     \
+    do { /* lock plus owner-tracking fields */  \
+        spin_lock_init(&(_p2m)->lock);          \
+        (_p2m)->locker = -1; /* no holder */    \
+        (_p2m)->locker_function = "nobody";     \
     } while (0)
 
-#define p2m_lock(_d)                                                \
-    do {                                                            \
-        if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
-        {                                                           \
-            printk("Error: p2m lock held by %s\n",                  \
-                   (_d)->arch.p2m.locker_function);                 \
-            BUG();                                                  \
-        }                                                           \
-        spin_lock(&(_d)->arch.p2m.lock);                            \
-        ASSERT((_d)->arch.p2m.locker == -1);                        \
-        (_d)->arch.p2m.locker = current->processor;                 \
-        (_d)->arch.p2m.locker_function = __func__;                  \
+#define p2m_lock(_p2m)                                          \
+    do { /* BUG() if this CPU already holds it, else take it */ \
+        if ( unlikely((_p2m)->locker == current->processor) )   \
+        {                                                       \
+            printk("Error: p2m lock held by %s\n",              \
+                   (_p2m)->locker_function);                    \
+            BUG();                                              \
+        }                                                       \
+        spin_lock(&(_p2m)->lock);                               \
+        ASSERT((_p2m)->locker == -1);                           \
+        (_p2m)->locker = current->processor; /* record holder */\
+        (_p2m)->locker_function = __func__;  /* and the taker */\
     } while (0)
 
-#define p2m_unlock(_d)                                              \
-    do {                                                            \
-        ASSERT((_d)->arch.p2m.locker == current->processor); \
-        (_d)->arch.p2m.locker = -1;                          \
-        (_d)->arch.p2m.locker_function = "nobody";           \
-        spin_unlock(&(_d)->arch.p2m.lock);                   \
+#define p2m_unlock(_p2m)                                \
+    do { /* asserts the releasing CPU is the holder */  \
+        ASSERT((_p2m)->locker == current->processor);   \
+        (_p2m)->locker = -1; /* cleared before unlock */\
+        (_p2m)->locker_function = "nobody";             \
+        spin_unlock(&(_p2m)->lock);                     \
     } while (0)
-
 
 
 /* Printouts */
@@ -152,7 +152,7 @@ p2m_next_level(struct domain *d, mfn_t *
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
-    ASSERT(d->arch.p2m.alloc_page);
+    ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
                                       shift, max)) )
@@ -160,10 +160,10 @@ p2m_next_level(struct domain *d, mfn_t *
 
     if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
     {
-        struct page_info *pg = d->arch.p2m.alloc_page(d);
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
             return 0;
-        list_add_tail(&pg->list, &d->arch.p2m.pages);
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
         pg->u.inuse.type_info = type | 1 | PGT_validated;
         pg->count_info = 1;
 
@@ -202,7 +202,7 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -244,8 +244,8 @@ set_p2m_entry(struct domain *d, unsigned
     ASSERT(p2m_entry);
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) )
-        d->arch.p2m.max_mapped_pfn = gfn;
+    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+        d->arch.p2m->max_mapped_pfn = gfn;
 
     if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
         entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
@@ -279,14 +279,158 @@ set_p2m_entry(struct domain *d, unsigned
     return rv;
 }
 
+static mfn_t
+p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{   /* Look gfn up by walking d's p2m pagetables, mapping one level at a time. */
+    mfn_t mfn;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    ASSERT(paging_mode_translate(d));
+
+    /* XXX This is for compatibility with the old model, where anything not
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m
+     * XXX we will return p2m_invalid for unmapped gfns */
+    *t = p2m_mmio_dm;
+
+    mfn = pagetable_get_mfn(d->arch.phys_table);    /* top-level table */
+
+    if ( gfn > d->arch.p2m->max_mapped_pfn )
+        /* This pfn is higher than the highest the p2m map currently holds */
+        return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    {
+        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));  /* frame of the l3 table */
+        unmap_domain_page(l4e);
+    }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    {
+        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+        /* On PAE hosts the p2m has eight l3 entries, not four (see
+         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+         * Instead, just count the number of l3es from zero.  It's safe
+         * to do this because we already checked that the gfn is within
+         * the bounds of the p2m. */
+        l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+        l3e += l3_table_offset(addr);
+#endif
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l3e_get_pfn(*l3e));  /* frame of the l2 table */
+        unmap_domain_page(l3e);
+    }
+#endif
+
+    l2e = map_domain_page(mfn_x(mfn));
+    l2e += l2_table_offset(addr);
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l2e_get_pfn(*l2e));      /* frame of the l1 table */
+    unmap_domain_page(l2e);
+
+    l1e = map_domain_page(mfn_x(mfn));
+    l1e += l1_table_offset(addr);
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));      /* the frame gfn maps to */
+    *t = p2m_flags_to_type(l1e_get_flags(*l1e));    /* type from the l1e flags */
+    unmap_domain_page(l1e);
+
+    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+    return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+}
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
+{
+    mfn_t mfn = _mfn(INVALID_MFN);      /* returned unless a valid entry is read */
+    p2m_type_t p2mt = p2m_mmio_dm;      /* default type, see the XXX note below */
+    /* XXX This is for compatibility with the old model, where anything not
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m
+     * XXX we will return p2m_invalid for unmapped gfns */
+
+    if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
+    {
+        l1_pgentry_t l1e = l1e_empty();
+        int ret;
+
+        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
+               / sizeof(l1_pgentry_t));
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+        ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[gfn],
+                               sizeof(l1e));
+
+        if ( ret == 0 ) {   /* otherwise the defaults above are returned */
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else 
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
+        }
+    }
+
+    *t = p2mt;
+    return mfn;
+}
 
 /* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d)
-{
-    p2m_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.p2m.pages);
-}
-
+int p2m_init(struct domain *d)
+{
+    struct p2m_domain *p2m = xmalloc(struct p2m_domain);
+
+    if ( p2m == NULL )
+        return -ENOMEM;     /* the only failure; 0 is returned otherwise */
+    d->arch.p2m = p2m;
+    /* xmalloc() memory is not zeroed: clear the fields not assigned below. */
+    p2m->alloc_page = NULL;
+    p2m->free_page = NULL;
+    p2m->max_mapped_pfn = 0;
+    p2m_lock_init(p2m);
+    INIT_LIST_HEAD(&p2m->pages);
+
+    /* Generic handlers first; ept_p2m_init() replaces them for Intel HAP. */
+    p2m->set_entry = p2m_set_entry;
+    p2m->get_entry = p2m_gfn_to_mfn;
+    p2m->get_entry_current = p2m_gfn_to_mfn_current;
+    if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        ept_p2m_init(d);
+    return 0;
+}
+
+static inline int
+set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{   /* Dispatch to the set_entry hook installed in d->arch.p2m. */
+    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+}
 
 // Allocate a new p2m table for a domain.
 //
@@ -308,28 +452,29 @@ int p2m_alloc_table(struct domain *d,
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
-
-    p2m_lock(d);
+    struct p2m_domain *p2m = d->arch.p2m;
+
+    p2m_lock(p2m);
 
     if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
     {
         P2M_ERROR("p2m already allocated for this domain\n");
-        p2m_unlock(d);
+        p2m_unlock(p2m);
         return -EINVAL;
     }
 
     P2M_PRINTK("allocating p2m table\n");
 
-    d->arch.p2m.alloc_page = alloc_page;
-    d->arch.p2m.free_page = free_page;
-
-    p2m_top = d->arch.p2m.alloc_page(d);
+    p2m->alloc_page = alloc_page;
+    p2m->free_page = free_page;
+
+    p2m_top = p2m->alloc_page(d);
     if ( p2m_top == NULL )
     {
-        p2m_unlock(d);
+        p2m_unlock(p2m);
         return -ENOMEM;
     }
-    list_add_tail(&p2m_top->list, &d->arch.p2m.pages);
+    list_add_tail(&p2m_top->list, &p2m->pages);
 
     p2m_top->count_info = 1;
     p2m_top->u.inuse.type_info =
@@ -376,13 +521,13 @@ int p2m_alloc_table(struct domain *d,
 #endif
 
     P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
-    p2m_unlock(d);
+    p2m_unlock(p2m);
     return 0;
 
  error:
     P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
                PRI_mfn "\n", gfn, mfn_x(mfn));
-    p2m_unlock(d);
+    p2m_unlock(p2m);
     return -ENOMEM;
 }
 
@@ -392,101 +537,24 @@ void p2m_teardown(struct domain *d)
 {
     struct list_head *entry, *n;
     struct page_info *pg;
-
-    p2m_lock(d);
+    struct p2m_domain *p2m = d->arch.p2m;
+
+    p2m_lock(p2m);
     d->arch.phys_table = pagetable_null();
 
-    list_for_each_safe(entry, n, &d->arch.p2m.pages)
+    list_for_each_safe(entry, n, &p2m->pages)
     {
         pg = list_entry(entry, struct page_info, list);
         list_del(entry);
-        d->arch.p2m.free_page(d, pg);
-    }
-    p2m_unlock(d);
-}
-
-mfn_t
-gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
-/* Read another domain's p2m entries */
-{
-    mfn_t mfn;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    l2_pgentry_t *l2e;
-    l1_pgentry_t *l1e;
-
-    ASSERT(paging_mode_translate(d));
-
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-    *t = p2m_mmio_dm;
-
-    mfn = pagetable_get_mfn(d->arch.phys_table);
-
-    if ( gfn > d->arch.p2m.max_mapped_pfn )
-        /* This pfn is higher than the highest the p2m map currently holds */
-        return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
-    {
-        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
-        l4e += l4_table_offset(addr);
-        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l4e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l4e_get_pfn(*l4e));
-        unmap_domain_page(l4e);
-    }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
-    {
-        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
-        /* On PAE hosts the p2m has eight l3 entries, not four (see
-         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
-         * Instead, just count the number of l3es from zero.  It's safe
-         * to do this because we already checked that the gfn is within
-         * the bounds of the p2m. */
-        l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
-        l3e += l3_table_offset(addr);
-#endif
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l3e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l3e_get_pfn(*l3e));
-        unmap_domain_page(l3e);
-    }
-#endif
-
-    l2e = map_domain_page(mfn_x(mfn));
-    l2e += l2_table_offset(addr);
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
-    {
-        unmap_domain_page(l2e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l2e_get_pfn(*l2e));
-    unmap_domain_page(l2e);
-
-    l1e = map_domain_page(mfn_x(mfn));
-    l1e += l1_table_offset(addr);
-    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
-    {
-        unmap_domain_page(l1e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
-    unmap_domain_page(l1e);
-
-    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-    return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+        p2m->free_page(d, pg);
+    }
+    p2m_unlock(p2m);
+}
+
+void p2m_final_teardown(struct domain *d)
+{   /* Free the p2m_domain that p2m_init() allocated and clear the pointer. */
+    xfree(d->arch.p2m);
+    d->arch.p2m = NULL;
 }
 
 #if P2M_AUDIT
@@ -564,7 +632,7 @@ static void audit_p2m(struct domain *d)
             set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
         }
 
-        if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) )
+        if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
         {
             lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type));
             if ( lp2mfn != mfn_x(p2mfn) )
@@ -695,11 +763,11 @@ guest_physmap_remove_page(struct domain 
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
                           unsigned long mfn)
 {
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
     audit_p2m(d);
     p2m_remove_page(d, gfn, mfn);
     audit_p2m(d);
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 }
 
 int
@@ -722,7 +790,7 @@ guest_physmap_add_entry(struct domain *d
      */
     if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
     {
-        if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) )
+        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
             dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
                     " 4GB: specify 'hap=0' domain config option.\n",
                     d->domain_id);
@@ -730,7 +798,7 @@ guest_physmap_add_entry(struct domain *d
     }
 #endif
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
     audit_p2m(d);
 
     P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
@@ -781,7 +849,7 @@ guest_physmap_add_entry(struct domain *d
     }
 
     audit_p2m(d);
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 
     return rc;
 }
@@ -812,7 +880,7 @@ void p2m_change_type_global(struct domai
     if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
         return;
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
 
 #if CONFIG_PAGING_LEVELS == 4
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
@@ -884,7 +952,7 @@ void p2m_change_type_global(struct domai
     unmap_domain_page(l2e);
 #endif
 
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 }
 
 /* Modify the p2m type of a single gfn from ot to nt, returning the 
@@ -895,13 +963,13 @@ p2m_type_t p2m_change_type(struct domain
     p2m_type_t pt;
     mfn_t mfn;
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
         set_p2m_entry(d, gfn, mfn, nt);
 
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 
     return pt;
 }
diff -r e1962ac0fb1c -r 9b635405ef90 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/arch/x86/mm/paging.c  Wed Apr 09 11:30:32 2008 +0100
@@ -484,9 +484,12 @@ void paging_log_dirty_teardown(struct do
 /*           CODE FOR PAGING SUPPORT            */
 /************************************************/
 /* Domain paging struct initialization. */
-void paging_domain_init(struct domain *d)
-{
-    p2m_init(d);
+int paging_domain_init(struct domain *d)
+{
+    int rc;
+
+    if ( (rc = p2m_init(d)) != 0 )
+        return rc;
 
     /* The order of the *_init calls below is important, as the later
      * ones may rewrite some common fields.  Shadow pagetables are the
@@ -496,6 +499,8 @@ void paging_domain_init(struct domain *d
     /* ... but we will use hardware assistance if it's available. */
     if ( hap_enabled(d) )
         hap_domain_init(d);
+
+    return 0;
 }
 
 /* vcpu paging struct initialization goes here */
@@ -589,6 +594,8 @@ void paging_final_teardown(struct domain
         hap_final_teardown(d);
     else
         shadow_final_teardown(d);
+
+    p2m_final_teardown(d);
 }
 
 /* Enable an arbitrary paging-assistance mode.  Call once at domain
diff -r e1962ac0fb1c -r 9b635405ef90 xen/common/domctl.c
--- a/xen/common/domctl.c       Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/common/domctl.c       Wed Apr 09 11:30:32 2008 +0100
@@ -25,6 +25,8 @@
 #include <public/domctl.h>
 #include <xsm/xsm.h>
 
+DEFINE_SPINLOCK(domctl_lock);
+
 extern long arch_do_domctl(
     struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
 
@@ -180,7 +182,6 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
 {
     long ret = 0;
     struct xen_domctl curop, *op = &curop;
-    static DEFINE_SPINLOCK(domctl_lock);
 
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
diff -r e1962ac0fb1c -r 9b635405ef90 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c       Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/drivers/passthrough/vtd/iommu.c       Wed Apr 09 11:30:32 2008 +0100
@@ -23,6 +23,7 @@
 #include <xen/sched.h>
 #include <xen/xmalloc.h>
 #include <xen/domain_page.h>
+#include <asm/paging.h>
 #include <xen/iommu.h>
 #include <xen/numa.h>
 #include "iommu.h"
@@ -2057,9 +2058,42 @@ void iommu_set_pgd(struct domain *d)
     }
     p2m_table = mfn_x(pagetable_get_mfn(d->arch.phys_table));
 
+    if ( paging_mode_hap(d) )
+    {
+        int level = agaw_to_level(hd->agaw);
+        struct dma_pte *dpte = NULL;
+        mfn_t pgd_mfn;
+
+        switch ( level )
+        {
+        case VTD_PAGE_TABLE_LEVEL_3:
+            dpte = map_domain_page(p2m_table);
+            if ( !dma_pte_present(*dpte) )
+            {
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "iommu_set_pgd: second level wasn't there\n");
+                unmap_domain_page(dpte);
+                return;
+            }
+            pgd_mfn = _mfn(dma_pte_addr(*dpte) >> PAGE_SHIFT_4K);
+            unmap_domain_page(dpte);
+            hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                pagetable_from_mfn(pgd_mfn)));
+            break;
+        case VTD_PAGE_TABLE_LEVEL_4:
+            pgd_mfn = _mfn(p2m_table);
+            hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                pagetable_from_mfn(pgd_mfn)));
+            break;
+        default:
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                     "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+            break;
+        }
+    }
+    else
+    {
 #if CONFIG_PAGING_LEVELS == 3
-    if ( !hd->pgd )
-    {
         int level = agaw_to_level(hd->agaw);
         struct dma_pte *pmd = NULL;
         struct dma_pte *pgd = NULL;
@@ -2125,10 +2159,7 @@ void iommu_set_pgd(struct domain *d)
         }
         unmap_domain_page(l3e);
         spin_unlock_irqrestore(&hd->mapping_lock, flags);
-    }
 #elif CONFIG_PAGING_LEVELS == 4
-    if ( !hd->pgd )
-    {
         int level = agaw_to_level(hd->agaw);
         l3_pgentry_t *l3e;
         mfn_t pgd_mfn;
@@ -2160,8 +2191,8 @@ void iommu_set_pgd(struct domain *d)
                      "iommu_set_pgd:Unsupported p2m table sharing level!\n");
             break;
         }
-    }
 #endif
+    }
     gdprintk(XENLOG_INFO VTDPREFIX,
              "iommu_set_pgd: hd->pgd = %p\n", hd->pgd);
 }
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/domain.h      Wed Apr 09 11:30:32 2008 +0100
@@ -138,27 +138,6 @@ struct hap_domain {
 };
 
 /************************************************/
-/*       p2m handling                           */
-/************************************************/
-struct p2m_domain {
-    /* Lock that protects updates to the p2m */
-    spinlock_t         lock;
-    int                locker;   /* processor which holds the lock */
-    const char        *locker_function; /* Func that took it */
-
-    /* Pages used to construct the p2m */
-    struct list_head   pages;
-
-    /* Functions to call to get or free pages for the p2m */
-    struct page_info * (*alloc_page  )(struct domain *d);
-    void               (*free_page   )(struct domain *d,
-                                       struct page_info *pg);
-
-    /* Highest guest frame that's ever been mapped in the p2m */
-    unsigned long max_mapped_pfn;
-};
-
-/************************************************/
 /*       common paging data structure           */
 /************************************************/
 struct log_dirty_domain {
@@ -208,6 +187,8 @@ struct paging_vcpu {
     struct shadow_vcpu shadow;
 };
 
+struct p2m_domain;
+
 struct arch_domain
 {
     l1_pgentry_t *mm_perdomain_pt;
@@ -232,7 +213,7 @@ struct arch_domain
     struct hvm_domain hvm_domain;
 
     struct paging_domain paging;
-    struct p2m_domain p2m ;
+    struct p2m_domain *p2m;
 
     /* Shadow translated domain: P2M mapping */
     pagetable_t phys_table;
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h  Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/hvm/domain.h  Wed Apr 09 11:30:32 2008 +0100
@@ -28,6 +28,8 @@
 #include <asm/hvm/vioapic.h>
 #include <asm/hvm/io.h>
 #include <xen/hvm/iommu.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/svm/vmcb.h>
 #include <public/hvm/params.h>
 #include <public/hvm/save.h>
 
@@ -60,8 +62,6 @@ struct hvm_domain {
 
     uint64_t               params[HVM_NR_PARAMS];
 
-    unsigned long          vmx_apic_access_mfn;
-
     /* Memory ranges with pinned cache attributes. */
     struct list_head       pinned_cacheattr_ranges;
 
@@ -74,11 +74,13 @@ struct hvm_domain {
     /* Pass-through */
     struct hvm_iommu       hvm_iommu;
 
-#if CONFIG_PAGING_LEVELS == 3
-    bool_t                 amd_npt_4gb_warning;
-#endif
     bool_t                 hap_enabled;
     bool_t                 qemu_mapcache_invalidate;
+
+    union {
+        struct vmx_domain vmx;
+        struct svm_domain svm;
+    };
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h        Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Apr 09 11:30:32 2008 +0100
@@ -444,6 +444,12 @@ struct vmcb_struct {
     u64 res16[301];
 } __attribute__ ((packed));
 
+struct svm_domain {     /* SVM member of the vmx/svm union in struct hvm_domain */
+#if CONFIG_PAGING_LEVELS == 3
+    bool_t npt_4gb_warning;     /* set once the "beyond 4GB" warning is printed */
+#endif
+};
+
 struct arch_svm_struct {
     struct vmcb_struct *vmcb;
     u64    vmcb_pa;
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Apr 09 11:30:32 2008 +0100
@@ -51,6 +51,23 @@ struct vmx_msr_state {
 struct vmx_msr_state {
     unsigned long flags;
     unsigned long msrs[VMX_MSR_COUNT];
+};
+
+#define EPT_DEFAULT_MT      6   /* NOTE(review): presumably the etmt value -- confirm */
+#define EPT_DEFAULT_GAW     3   /* table levels walked before the final table */
+
+struct vmx_domain {
+    unsigned long apic_access_mfn;  /* was hvm_domain.vmx_apic_access_mfn */
+
+    union {     /* bitfield view and raw 64-bit view of the same value */
+        struct {
+            u64 etmt :3,    /* presumably memory type, cf. EPT_DEFAULT_MT */
+                gaw  :3,    /* presumably address width, cf. EPT_DEFAULT_GAW */
+                rsvd :6,
+                asr  :52;   /* NOTE(review): looks like the table base -- confirm */
+        };
+        u64 eptp;
+    } ept_control;
 };
 
 struct arch_vmx_struct {
@@ -71,6 +88,7 @@ struct arch_vmx_struct {
 
     /* Cache of cpu execution control. */
     u32                  exec_control;
+    u32                  secondary_exec_control;
 
     /* PMU */
     struct vpmu_struct   vpmu;
@@ -108,6 +126,8 @@ void vmx_vmcs_exit(struct vcpu *v);
 #define CPU_BASED_MWAIT_EXITING               0x00000400
 #define CPU_BASED_RDPMC_EXITING               0x00000800
 #define CPU_BASED_RDTSC_EXITING               0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING            0x00008000
+#define CPU_BASED_CR3_STORE_EXITING           0x00010000
 #define CPU_BASED_CR8_LOAD_EXITING            0x00080000
 #define CPU_BASED_CR8_STORE_EXITING           0x00100000
 #define CPU_BASED_TPR_SHADOW                  0x00200000
@@ -136,6 +156,7 @@ extern u32 vmx_vmentry_control;
 extern u32 vmx_vmentry_control;
 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_WBINVD_EXITING           0x00000040
 extern u32 vmx_secondary_exec_control;
 
@@ -151,6 +172,10 @@ extern bool_t cpu_has_vmx_ins_outs_instr
     (vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS)
 #define cpu_has_vmx_msr_bitmap \
     (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP)
+#define cpu_has_vmx_secondary_exec_control \
+    (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+#define cpu_has_vmx_ept \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
 #define VMX_INTR_SHADOW_STI             0x00000001
@@ -192,11 +217,23 @@ enum vmcs_field {
     VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
     VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
     APIC_ACCESS_ADDR                = 0x00002014,
-    APIC_ACCESS_ADDR_HIGH           = 0x00002015, 
+    APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+    EPT_POINTER                     = 0x0000201a,
+    EPT_POINTER_HIGH                = 0x0000201b,
+    GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+    GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
     VMCS_LINK_POINTER               = 0x00002800,
     VMCS_LINK_POINTER_HIGH          = 0x00002801,
     GUEST_IA32_DEBUGCTL             = 0x00002802,
     GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+    GUEST_PDPTR0                    = 0x0000280a,
+    GUEST_PDPTR0_HIGH               = 0x0000280b,
+    GUEST_PDPTR1                    = 0x0000280c,
+    GUEST_PDPTR1_HIGH               = 0x0000280d,
+    GUEST_PDPTR2                    = 0x0000280e,
+    GUEST_PDPTR2_HIGH               = 0x0000280f,
+    GUEST_PDPTR3                    = 0x00002810,
+    GUEST_PDPTR3_HIGH               = 0x00002811,
     PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
     CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
     EXCEPTION_BITMAP                = 0x00004004,
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Apr 09 11:30:32 2008 +0100
@@ -23,9 +23,27 @@
 #include <asm/types.h>
 #include <asm/regs.h>
 #include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/trace.h>
 #include <asm/hvm/vmx/vmcs.h>
-#include <asm/i387.h>
-#include <asm/hvm/trace.h>
+
+typedef union {     /* one 64-bit EPT entry: bitfield view and raw view */
+    struct {
+        u64 r       :   1,  /* r, w, x: ept_get_entry() requires all three set */
+        w           :   1,
+        x           :   1,
+        emt         :   4,  /* NOTE(review): presumably memory type -- confirm */
+        sp_avail    :   1,
+        avail1      :   4,  /* holds the p2m type (read by ept_get_entry()) */
+        mfn         :   45, /* frame number the entry refers to */
+        rsvd        :   5,
+        avail2      :   2;
+    };
+    u64 epte;
+} ept_entry_t;
+
+#define EPT_TABLE_ORDER     9
 
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
@@ -80,6 +98,8 @@ void vmx_realmode(struct cpu_user_regs *
 #define EXIT_REASON_MACHINE_CHECK       41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EPT_VIOLATION       48
+#define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD              54
 
 /*
@@ -143,12 +163,14 @@ void vmx_realmode(struct cpu_user_regs *
 #define VMREAD_OPCODE   ".byte 0x0f,0x78\n"
 #define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
 #define VMWRITE_OPCODE  ".byte 0x0f,0x79\n"
+#define INVEPT_OPCODE   ".byte 0x66,0x0f,0x38,0x80\n"   /* m128,r64/32 */
 #define VMXOFF_OPCODE   ".byte 0x0f,0x01,0xc4\n"
 #define VMXON_OPCODE    ".byte 0xf3,0x0f,0xc7\n"
 
+#define MODRM_EAX_08    ".byte 0x08\n" /* ECX, [EAX] */
 #define MODRM_EAX_06    ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
 #define MODRM_EAX_07    ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
-#define MODRM_EAX_ECX   ".byte 0xc1\n" /* [EAX], [ECX] */
+#define MODRM_EAX_ECX   ".byte 0xc1\n" /* EAX, ECX */
 
 static inline void __vmptrld(u64 addr)
 {
@@ -231,6 +253,31 @@ static inline void __vm_clear_bit(unsign
 {
     __vmwrite(field, __vmread(field) & ~(1UL << bit));
 }
+
+static inline void __invept(int ext, u64 eptp, u64 gpa)
+{
+    struct {
+        u64 eptp, gpa;
+    } operand = {eptp, gpa};    /* in-memory descriptor, addressed via EAX */
+    /* ext selects the INVEPT type; widened so all of (R|E)CX is defined. */
+    __asm__ __volatile__ ( INVEPT_OPCODE
+                           MODRM_EAX_08
+                           /* CF==1 or ZF==1 --> crash (ud2) */
+                           "ja 1f ; ud2 ; 1:\n"
+                           :
+                           : "a" (&operand), "c" ((unsigned long)ext)
+                           : "memory");
+}
+
+static inline void ept_sync_all(void)
+{
+    if ( !current->domain->arch.hvm_domain.hap_enabled )
+        return;     /* nothing to do unless current's domain has HAP enabled */
+
+    __invept(2, 0, 0);  /* NOTE(review): type 2 presumably = all contexts */
+}
+
+void ept_sync_domain(struct domain *d);
 
 static inline void __vmxoff(void)
 {
@@ -265,4 +312,6 @@ void vmx_inject_extint(struct vcpu *v, i
 void vmx_inject_extint(struct vcpu *v, int trap);
 void vmx_inject_nmi(struct vcpu *v);
 
+void ept_p2m_init(struct domain *d);
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/p2m.h Wed Apr 09 11:30:32 2008 +0100
@@ -26,6 +26,8 @@
 #ifndef _XEN_P2M_H
 #define _XEN_P2M_H
 
+#include <xen/config.h>
+#include <xen/paging.h>
 
 /*
  * The phys_to_machine_mapping maps guest physical frame numbers 
@@ -86,54 +88,49 @@ typedef enum {
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
 
+struct p2m_domain {
+    /* Lock that protects updates to the p2m */
+    spinlock_t         lock;
+    int                locker;   /* processor which holds the lock */
+    const char        *locker_function; /* Func that took it */
+
+    /* Pages used to construct the p2m */
+    struct list_head   pages;
+
+    /* Functions to call to get or free pages for the p2m */
+    struct page_info * (*alloc_page  )(struct domain *d);
+    void               (*free_page   )(struct domain *d,
+                                       struct page_info *pg);
+    int                (*set_entry   )(struct domain *d, unsigned long gfn,
+                                       mfn_t mfn, p2m_type_t p2mt);
+    mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
+                                       p2m_type_t *p2mt);
+    mfn_t              (*get_entry_current)(unsigned long gfn,
+                                            p2m_type_t *p2mt);
+
+    /* Highest guest frame that's ever been mapped in the p2m */
+    unsigned long max_mapped_pfn;
+};
+
 /* Extract the type from the PTE flags that store it */
 static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
 {
     /* Type is stored in the "available" bits, 9, 10 and 11 */
     return (flags >> 9) & 0x7;
 }
- 
-/* Read the current domain's p2m table (through the linear mapping). */
+
+/* Read the current domain's p2m table. */
 static inline mfn_t gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
 {
-    mfn_t mfn = _mfn(INVALID_MFN);
-    p2m_type_t p2mt = p2m_mmio_dm;
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-
-    if ( gfn <= current->domain->arch.p2m.max_mapped_pfn )
-    {
-        l1_pgentry_t l1e = l1e_empty();
-        int ret;
-
-        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
-               / sizeof(l1_pgentry_t));
-
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-            if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
-                p2mt = p2m_mmio_dm;
-        }
-    }
-
-    *t = p2mt;
-    return mfn;
+    return current->domain->arch.p2m->get_entry_current(gfn, t);
 }
 
 /* Read another domain's P2M table, mapping pages as we go */
-mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t);
+static inline
+mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    return d->arch.p2m->get_entry(d, gfn, t);
+}
 
 /* General conversion function from gfn to mfn */
 #define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), (g), (t))
@@ -149,7 +146,7 @@ static inline mfn_t _gfn_to_mfn(struct d
     }
     if ( likely(current->domain == d) )
         return gfn_to_mfn_current(gfn, t);
-    else 
+    else
         return gfn_to_mfn_foreign(d, gfn, t);
 }
 
@@ -185,7 +182,7 @@ gl1e_to_ml1e(struct domain *d, l1_pgentr
 
 
 /* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d);
+int p2m_init(struct domain *d);
 
 /* Allocate a new p2m table for a domain. 
  *
@@ -199,6 +196,7 @@ int p2m_alloc_table(struct domain *d,
 
 /* Return all the p2m resources to Xen. */
 void p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/asm-x86/paging.h      Wed Apr 09 11:30:32 2008 +0100
@@ -183,7 +183,7 @@ void paging_vcpu_init(struct vcpu *v);
 
 /* Set up the paging-assistance-specific parts of a domain struct at
  * start of day.  Called for every domain from arch_domain_create() */
-void paging_domain_init(struct domain *d);
+int paging_domain_init(struct domain *d);
 
 /* Handler for paging-control ops: operations from user-space to enable
  * and disable ephemeral shadow modes (test mode and log-dirty mode) and
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/public/hvm/params.h
--- a/xen/include/public/hvm/params.h   Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/public/hvm/params.h   Wed Apr 09 11:30:32 2008 +0100
@@ -83,7 +83,8 @@
 
 /* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
 #define HVM_PARAM_HPET_ENABLED 11
+#define HVM_PARAM_IDENT_PT     12
 
-#define HVM_NR_PARAMS          12
+#define HVM_NR_PARAMS          13
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff -r e1962ac0fb1c -r 9b635405ef90 xen/include/xen/hypercall.h
--- a/xen/include/xen/hypercall.h       Tue Apr 08 11:41:27 2008 +0100
+++ b/xen/include/xen/hypercall.h       Wed Apr 09 11:30:32 2008 +0100
@@ -30,6 +30,7 @@ do_sched_op(
     int cmd,
     XEN_GUEST_HANDLE(void) arg);
 
+extern spinlock_t domctl_lock;
 extern long
 do_domctl(
     XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] x86, vmx: Enable EPT (Extended PageTable) support on new Intel processors., Xen patchbot-unstable <=